/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#pragma once
#ifndef _INCLUDED_EMM
#define _INCLUDED_EMM

#include <crtdefs.h>
#include <xmmintrin.h>

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i
{
    __int8  m128i_i8[16];
    __int16 m128i_i16[8];
    __int32 m128i_i32[4];
    __int64 m128i_i64[2];
    unsigned __int8  m128i_u8[16];
    unsigned __int16 m128i_u16[8];
    unsigned __int32 m128i_u32[4];
    unsigned __int64 m128i_u64[2];
} __m128i;
_STATIC_ASSERT(sizeof(__m128i) == 16);

typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d
{
    double m128d_f64[2];
} __m128d;

typedef __declspec(align(1)) __m128i __m128i_u;
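
/* Illustrative note (MSVC path only): the union members above expose the
   lanes by name, e.g.
       __m128i v = _mm_set1_epi32(7);
       int lane0 = v.m128i_i32[0];   // 7
   The GCC/Clang vector types below have no named members, so portable code
   should use the load/store/extract intrinsics instead. */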

#define __ATTRIBUTE_SSE2__

#else /* _MSC_VER */

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
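
/* Illustration: whether plain char is signed is implementation-defined, so a
   (char)-1 lane could otherwise compare as 255. _mm_cmpgt_epi8() below casts
   through __v16qs to force the signed comparison that PCMPGTB performs. */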

#ifdef __clang__
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
#endif
#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__

#endif /* _MSC_VER */

extern __m128d _mm_add_sd(__m128d a, __m128d b);
extern __m128d _mm_add_pd(__m128d a, __m128d b);
extern __m128d _mm_sub_sd(__m128d a, __m128d b);
extern __m128d _mm_sub_pd(__m128d a, __m128d b);
extern __m128d _mm_mul_sd(__m128d a, __m128d b);
extern __m128d _mm_mul_pd(__m128d a, __m128d b);
extern __m128d _mm_div_sd(__m128d a, __m128d b);
extern __m128d _mm_div_pd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_sd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_pd(__m128d a);
extern __m128d _mm_min_sd(__m128d a, __m128d b);
extern __m128d _mm_min_pd(__m128d a, __m128d b);
extern __m128d _mm_max_sd(__m128d a, __m128d b);
extern __m128d _mm_max_pd(__m128d a, __m128d b);
extern __m128d _mm_and_pd(__m128d a, __m128d b);
extern __m128d _mm_andnot_pd(__m128d a, __m128d b);
extern __m128d _mm_or_pd(__m128d a, __m128d b);
extern __m128d _mm_xor_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmple_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmple_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b);
extern int _mm_comieq_sd(__m128d a, __m128d b);
extern int _mm_comilt_sd(__m128d a, __m128d b);
extern int _mm_comile_sd(__m128d a, __m128d b);
extern int _mm_comigt_sd(__m128d a, __m128d b);
extern int _mm_comige_sd(__m128d a, __m128d b);
extern int _mm_comineq_sd(__m128d a, __m128d b);
extern int _mm_ucomieq_sd(__m128d a, __m128d b);
extern int _mm_ucomilt_sd(__m128d a, __m128d b);
extern int _mm_ucomile_sd(__m128d a, __m128d b);
extern int _mm_ucomigt_sd(__m128d a, __m128d b);
extern int _mm_ucomige_sd(__m128d a, __m128d b);
extern int _mm_ucomineq_sd(__m128d a, __m128d b);
extern __m128 _mm_cvtpd_ps(__m128d a);
extern __m128d _mm_cvtps_pd(__m128 a);
extern __m128d _mm_cvtepi32_pd(__m128i a);
extern __m128i _mm_cvtpd_epi32(__m128d a);
extern int _mm_cvtsd_si32(__m128d a);
extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b);
extern __m128d _mm_cvtsi32_sd(__m128d a, int b);
extern __m128d _mm_cvtss_sd(__m128d a, __m128 b);
extern __m128i _mm_cvttpd_epi32(__m128d a);
extern int _mm_cvttsd_si32(__m128d a);
extern __m64 _mm_cvtpd_pi32(__m128d a);
extern __m64 _mm_cvttpd_pi32(__m128d a);
extern __m128d _mm_cvtpi32_pd(__m64 a);
extern double _mm_cvtsd_f64(__m128d a);
extern __m128d _mm_load_pd(double const *dp);
extern __m128d _mm_load1_pd(double const *dp);
extern __m128d _mm_loadr_pd(double const *dp);
extern __m128d _mm_loadu_pd(double const *dp);
//extern __m128i _mm_loadu_si64(void const *a);
//extern __m128i _mm_loadu_si32(void const *a);
//extern __m128i _mm_loadu_si16(void const *a);
extern __m128d _mm_load_sd(double const *dp);
extern __m128d _mm_loadh_pd(__m128d a, double const *dp);
extern __m128d _mm_loadl_pd(__m128d a, double const *dp);
//extern __m128d _mm_undefined_pd(void);
extern __m128d _mm_set_sd(double w);
extern __m128d _mm_set1_pd(double w);
extern __m128d _mm_set_pd(double w, double x);
extern __m128d _mm_setr_pd(double w, double x);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d a, __m128d b);
extern void _mm_store_sd(double *dp, __m128d a);
extern void _mm_store_pd(double *dp, __m128d a);
extern void _mm_store1_pd(double *dp, __m128d a);
extern void _mm_storeu_pd(double *dp, __m128d a);
extern void _mm_storer_pd(double *dp, __m128d a);
extern void _mm_storeh_pd(double *dp, __m128d a);
extern void _mm_storel_pd(double *dp, __m128d a);
extern __m128i _mm_add_epi8(__m128i a, __m128i b);
extern __m128i _mm_add_epi16(__m128i a, __m128i b);
extern __m128i _mm_add_epi32(__m128i a, __m128i b);
extern __m64 _mm_add_si64(__m64 a, __m64 b);
extern __m128i _mm_add_epi64(__m128i a, __m128i b);
extern __m128i _mm_adds_epi8(__m128i a, __m128i b);
extern __m128i _mm_adds_epi16(__m128i a, __m128i b);
extern __m128i _mm_adds_epu8(__m128i a, __m128i b);
extern __m128i _mm_adds_epu16(__m128i a, __m128i b);
extern __m128i _mm_avg_epu8(__m128i a, __m128i b);
extern __m128i _mm_avg_epu16(__m128i a, __m128i b);
extern __m128i _mm_madd_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epu8(__m128i a, __m128i b);
extern __m128i _mm_min_epi16(__m128i a, __m128i b);
extern __m128i _mm_min_epu8(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b);
extern __m128i _mm_mullo_epi16(__m128i a, __m128i b);
extern __m64 _mm_mul_su32(__m64 a, __m64 b);
extern __m128i _mm_mul_epu32(__m128i a, __m128i b);
extern __m128i _mm_sad_epu8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi16(__m128i a, __m128i b);
extern __m128i _mm_sub_epi32(__m128i a, __m128i b);
extern __m64 _mm_sub_si64(__m64 a, __m64 b);
extern __m128i _mm_sub_epi64(__m128i a, __m128i b);
extern __m128i _mm_subs_epi8(__m128i a, __m128i b);
extern __m128i _mm_subs_epi16(__m128i a, __m128i b);
extern __m128i _mm_subs_epu8(__m128i a, __m128i b);
extern __m128i _mm_subs_epu16(__m128i a, __m128i b);
extern __m128i _mm_and_si128(__m128i a, __m128i b);
extern __m128i _mm_andnot_si128(__m128i a, __m128i b);
extern __m128i _mm_or_si128(__m128i a, __m128i b);
extern __m128i _mm_xor_si128(__m128i a, __m128i b);
extern __m128i _mm_slli_si128(__m128i a, int i);
extern __m128i _mm_slli_epi16(__m128i a, int count);
extern __m128i _mm_sll_epi16(__m128i a, __m128i count);
extern __m128i _mm_slli_epi32(__m128i a, int count);
extern __m128i _mm_sll_epi32(__m128i a, __m128i count);
extern __m128i _mm_slli_epi64(__m128i a, int count);
extern __m128i _mm_sll_epi64(__m128i a, __m128i count);
extern __m128i _mm_srai_epi16(__m128i a, int count);
extern __m128i _mm_sra_epi16(__m128i a, __m128i count);
extern __m128i _mm_srai_epi32(__m128i a, int count);
extern __m128i _mm_sra_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_si128(__m128i a, int imm);
extern __m128i _mm_srli_epi16(__m128i a, int count);
extern __m128i _mm_srl_epi16(__m128i a, __m128i count);
extern __m128i _mm_srli_epi32(__m128i a, int count);
extern __m128i _mm_srl_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_epi64(__m128i a, int count);
extern __m128i _mm_srl_epi64(__m128i a, __m128i count);
extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b);
#ifdef _M_AMD64
extern __m128d _mm_cvtsi64_sd(__m128d a, long long b);
extern long long _mm_cvtsd_si64(__m128d a);
extern long long _mm_cvttsd_si64(__m128d a);
#endif
extern __m128 _mm_cvtepi32_ps(__m128i a);
extern __m128i _mm_cvtps_epi32(__m128 a);
extern __m128i _mm_cvttps_epi32(__m128 a);
extern __m128i _mm_cvtsi32_si128(int a);
#ifdef _M_AMD64
extern __m128i _mm_cvtsi64_si128(long long a);
#endif
extern int _mm_cvtsi128_si32(__m128i a);
#ifdef _M_AMD64
extern long long _mm_cvtsi128_si64(__m128i a);
#endif
extern __m128i _mm_load_si128(__m128i const *p);
extern __m128i _mm_loadu_si128(__m128i_u const *p);
extern __m128i _mm_loadl_epi64(__m128i_u const *p);
//extern __m128i _mm_undefined_si128(void);
//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME
extern __m128i _mm_set_epi64(__m64 q1, __m64 q0);
//extern __m128i _mm_set_epi32(int i3, int i1, int i0);
extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
//extern __m128i _mm_set_epi16(short w7, short w2, short w1, short w0);
extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
//extern __m128i _mm_set_epi8(char b15, char b10, char b4, char b3, char b2, char b1, char b0);
extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
//extern __m128i _mm_set1_epi64x(long long q); // FIXME
extern __m128i _mm_set1_epi64(__m64 q);
extern __m128i _mm_set1_epi32(int i);
extern __m128i _mm_set1_epi16(short w);
extern __m128i _mm_set1_epi8(char b);
extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang?
extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1);
//extern __m128i _mm_setr_epi32(int i0, int i2, int i3);
extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
//extern __m128i _mm_setr_epi16(short w0, short w5, short w6, short w7);
extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
//extern __m128i _mm_setr_epi8(char b0, char b6, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setzero_si128(void);
extern void _mm_store_si128(__m128i *p, __m128i b);
extern void _mm_storeu_si128(__m128i_u *p, __m128i b);
//extern void _mm_storeu_si64(void *p, __m128i b);
//extern void _mm_storeu_si32(void *p, __m128i b);
//extern void _mm_storeu_si16(void *p, __m128i b);
extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p);
extern void _mm_storel_epi64(__m128i_u *p, __m128i a);
extern void _mm_stream_pd(double *p, __m128d a);
extern void _mm_stream_si128(__m128i *p, __m128i a);
extern void _mm_stream_si32(int *p, int a);
extern void _mm_clflush(void const *p);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern __m128i _mm_packs_epi16(__m128i a, __m128i b);
extern __m128i _mm_packs_epi32(__m128i a, __m128i b);
extern __m128i _mm_packus_epi16(__m128i a, __m128i b);
extern int _mm_extract_epi16(__m128i a, int imm);
extern __m128i _mm_insert_epi16(__m128i a, int b, int imm);
extern int _mm_movemask_epi8(__m128i a);
extern __m128i _mm_shuffle_epi32(__m128i a, int imm);
extern __m128i _mm_shufflelo_epi16(__m128i a, int imm);
extern __m128i _mm_shufflehi_epi16(__m128i a, int imm);
extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
extern __m64 _mm_movepi64_pi64(__m128i a);
extern __m128i _mm_movpi64_epi64(__m64 a);
extern __m128i _mm_move_epi64(__m128i a);
extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b);
extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b);
extern int _mm_movemask_pd(__m128d a);
extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm);
extern __m128 _mm_castpd_ps(__m128d a);
extern __m128i _mm_castpd_si128(__m128d a);
extern __m128d _mm_castps_pd(__m128 a);
extern __m128i _mm_castps_si128(__m128 a);
extern __m128 _mm_castsi128_ps(__m128i a);
extern __m128d _mm_castsi128_pd(__m128i a);
void _mm_pause(void);

/* Alternate names */
#define _mm_set_pd1(a) _mm_set1_pd(a)
#define _mm_load_pd1(p) _mm_load1_pd(p)
#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a))
#define _mm_bslli_si128 _mm_slli_si128
#define _mm_bsrli_si128 _mm_srli_si128
#define _mm_stream_si64 _mm_stream_si64x

#if defined(_MSC_VER) && !defined(__clang__)

#pragma intrinsic(_mm_add_sd)
#pragma intrinsic(_mm_add_pd)
#pragma intrinsic(_mm_sub_sd)
#pragma intrinsic(_mm_sub_pd)
#pragma intrinsic(_mm_mul_sd)
#pragma intrinsic(_mm_mul_pd)
#pragma intrinsic(_mm_div_sd)
#pragma intrinsic(_mm_div_pd)
#pragma intrinsic(_mm_sqrt_sd)
#pragma intrinsic(_mm_sqrt_pd)
#pragma intrinsic(_mm_min_sd)
#pragma intrinsic(_mm_min_pd)
#pragma intrinsic(_mm_max_sd)
#pragma intrinsic(_mm_max_pd)
#pragma intrinsic(_mm_and_pd)
#pragma intrinsic(_mm_andnot_pd)
#pragma intrinsic(_mm_or_pd)
#pragma intrinsic(_mm_xor_pd)
#pragma intrinsic(_mm_cmpeq_pd)
#pragma intrinsic(_mm_cmplt_pd)
#pragma intrinsic(_mm_cmple_pd)
#pragma intrinsic(_mm_cmpgt_pd)
#pragma intrinsic(_mm_cmpge_pd)
#pragma intrinsic(_mm_cmpord_pd)
#pragma intrinsic(_mm_cmpunord_pd)
#pragma intrinsic(_mm_cmpneq_pd)
#pragma intrinsic(_mm_cmpnlt_pd)
#pragma intrinsic(_mm_cmpnle_pd)
#pragma intrinsic(_mm_cmpngt_pd)
#pragma intrinsic(_mm_cmpnge_pd)
#pragma intrinsic(_mm_cmpeq_sd)
#pragma intrinsic(_mm_cmplt_sd)
#pragma intrinsic(_mm_cmple_sd)
#pragma intrinsic(_mm_cmpgt_sd)
#pragma intrinsic(_mm_cmpge_sd)
#pragma intrinsic(_mm_cmpord_sd)
#pragma intrinsic(_mm_cmpunord_sd)
#pragma intrinsic(_mm_cmpneq_sd)
#pragma intrinsic(_mm_cmpnlt_sd)
#pragma intrinsic(_mm_cmpnle_sd)
#pragma intrinsic(_mm_cmpngt_sd)
#pragma intrinsic(_mm_cmpnge_sd)
#pragma intrinsic(_mm_comieq_sd)
#pragma intrinsic(_mm_comilt_sd)
#pragma intrinsic(_mm_comile_sd)
#pragma intrinsic(_mm_comigt_sd)
#pragma intrinsic(_mm_comige_sd)
#pragma intrinsic(_mm_comineq_sd)
#pragma intrinsic(_mm_ucomieq_sd)
#pragma intrinsic(_mm_ucomilt_sd)
#pragma intrinsic(_mm_ucomile_sd)
#pragma intrinsic(_mm_ucomigt_sd)
#pragma intrinsic(_mm_ucomige_sd)
#pragma intrinsic(_mm_ucomineq_sd)
#pragma intrinsic(_mm_cvtpd_ps)
#pragma intrinsic(_mm_cvtps_pd)
#pragma intrinsic(_mm_cvtepi32_pd)
#pragma intrinsic(_mm_cvtpd_epi32)
#pragma intrinsic(_mm_cvtsd_si32)
#pragma intrinsic(_mm_cvtsd_ss)
#pragma intrinsic(_mm_cvtsi32_sd)
#pragma intrinsic(_mm_cvtss_sd)
#pragma intrinsic(_mm_cvttpd_epi32)
#pragma intrinsic(_mm_cvttsd_si32)
//#pragma intrinsic(_mm_cvtpd_pi32)
//#pragma intrinsic(_mm_cvttpd_pi32)
//#pragma intrinsic(_mm_cvtpi32_pd)
#pragma intrinsic(_mm_cvtsd_f64)
#pragma intrinsic(_mm_load_pd)
#pragma intrinsic(_mm_load1_pd)
#pragma intrinsic(_mm_loadr_pd)
#pragma intrinsic(_mm_loadu_pd)
//#pragma intrinsic(_mm_loadu_si64)
//#pragma intrinsic(_mm_loadu_si32)
//#pragma intrinsic(_mm_loadu_si16)
#pragma intrinsic(_mm_load_sd)
#pragma intrinsic(_mm_loadh_pd)
#pragma intrinsic(_mm_loadl_pd)
//#pragma intrinsic(_mm_undefined_pd)
#pragma intrinsic(_mm_set_sd)
#pragma intrinsic(_mm_set1_pd)
#pragma intrinsic(_mm_set_pd)
#pragma intrinsic(_mm_setr_pd)
#pragma intrinsic(_mm_setzero_pd)
#pragma intrinsic(_mm_move_sd)
#pragma intrinsic(_mm_store_sd)
#pragma intrinsic(_mm_store_pd)
#pragma intrinsic(_mm_store1_pd)
#pragma intrinsic(_mm_storeu_pd)
#pragma intrinsic(_mm_storer_pd)
#pragma intrinsic(_mm_storeh_pd)
#pragma intrinsic(_mm_storel_pd)
#pragma intrinsic(_mm_add_epi8)
#pragma intrinsic(_mm_add_epi16)
#pragma intrinsic(_mm_add_epi32)
//#pragma intrinsic(_mm_add_si64)
#pragma intrinsic(_mm_add_epi64)
#pragma intrinsic(_mm_adds_epi8)
#pragma intrinsic(_mm_adds_epi16)
#pragma intrinsic(_mm_adds_epu8)
#pragma intrinsic(_mm_adds_epu16)
#pragma intrinsic(_mm_avg_epu8)
#pragma intrinsic(_mm_avg_epu16)
#pragma intrinsic(_mm_madd_epi16)
#pragma intrinsic(_mm_max_epi16)
#pragma intrinsic(_mm_max_epu8)
#pragma intrinsic(_mm_min_epi16)
#pragma intrinsic(_mm_min_epu8)
#pragma intrinsic(_mm_mulhi_epi16)
#pragma intrinsic(_mm_mulhi_epu16)
#pragma intrinsic(_mm_mullo_epi16)
//#pragma intrinsic(_mm_mul_su32)
#pragma intrinsic(_mm_mul_epu32)
#pragma intrinsic(_mm_sad_epu8)
#pragma intrinsic(_mm_sub_epi8)
#pragma intrinsic(_mm_sub_epi16)
#pragma intrinsic(_mm_sub_epi32)
//#pragma intrinsic(_mm_sub_si64)
#pragma intrinsic(_mm_sub_epi64)
#pragma intrinsic(_mm_subs_epi8)
#pragma intrinsic(_mm_subs_epi16)
#pragma intrinsic(_mm_subs_epu8)
#pragma intrinsic(_mm_subs_epu16)
#pragma intrinsic(_mm_and_si128)
#pragma intrinsic(_mm_andnot_si128)
#pragma intrinsic(_mm_or_si128)
#pragma intrinsic(_mm_xor_si128)
#pragma intrinsic(_mm_slli_si128)
#pragma intrinsic(_mm_slli_epi16)
#pragma intrinsic(_mm_sll_epi16)
#pragma intrinsic(_mm_slli_epi32)
#pragma intrinsic(_mm_sll_epi32)
#pragma intrinsic(_mm_slli_epi64)
#pragma intrinsic(_mm_sll_epi64)
#pragma intrinsic(_mm_srai_epi16)
#pragma intrinsic(_mm_sra_epi16)
#pragma intrinsic(_mm_srai_epi32)
#pragma intrinsic(_mm_sra_epi32)
#pragma intrinsic(_mm_srli_si128)
#pragma intrinsic(_mm_srli_epi16)
#pragma intrinsic(_mm_srl_epi16)
#pragma intrinsic(_mm_srli_epi32)
#pragma intrinsic(_mm_srl_epi32)
#pragma intrinsic(_mm_srli_epi64)
#pragma intrinsic(_mm_srl_epi64)
#pragma intrinsic(_mm_cmpeq_epi8)
#pragma intrinsic(_mm_cmpeq_epi16)
#pragma intrinsic(_mm_cmpeq_epi32)
#pragma intrinsic(_mm_cmpgt_epi8)
#pragma intrinsic(_mm_cmpgt_epi16)
#pragma intrinsic(_mm_cmpgt_epi32)
#pragma intrinsic(_mm_cmplt_epi8)
#pragma intrinsic(_mm_cmplt_epi16)
#pragma intrinsic(_mm_cmplt_epi32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_sd)
#pragma intrinsic(_mm_cvtsd_si64)
#pragma intrinsic(_mm_cvttsd_si64)
#endif
#pragma intrinsic(_mm_cvtepi32_ps)
#pragma intrinsic(_mm_cvtps_epi32)
#pragma intrinsic(_mm_cvttps_epi32)
#pragma intrinsic(_mm_cvtsi32_si128)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_si128)
#endif
#pragma intrinsic(_mm_cvtsi128_si32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi128_si64)
#endif
#pragma intrinsic(_mm_load_si128)
#pragma intrinsic(_mm_loadu_si128)
#pragma intrinsic(_mm_loadl_epi64)
//#pragma intrinsic(_mm_undefined_si128)
#pragma intrinsic(_mm_set_epi64x)
//#pragma intrinsic(_mm_set_epi64)
#pragma intrinsic(_mm_set_epi32)
#pragma intrinsic(_mm_set_epi16)
#pragma intrinsic(_mm_set_epi8)
#pragma intrinsic(_mm_set1_epi64x)
//#pragma intrinsic(_mm_set1_epi64)
#pragma intrinsic(_mm_set1_epi32)
#pragma intrinsic(_mm_set1_epi16)
#pragma intrinsic(_mm_set1_epi8)
#pragma intrinsic(_mm_setl_epi64)
//#pragma intrinsic(_mm_setr_epi64)
#pragma intrinsic(_mm_setr_epi32)
#pragma intrinsic(_mm_setr_epi16)
#pragma intrinsic(_mm_setr_epi8)
#pragma intrinsic(_mm_setzero_si128)
#pragma intrinsic(_mm_store_si128)
#pragma intrinsic(_mm_storeu_si128)
//#pragma intrinsic(_mm_storeu_si64)
//#pragma intrinsic(_mm_storeu_si32)
//#pragma intrinsic(_mm_storeu_si16)
#pragma intrinsic(_mm_maskmoveu_si128)
#pragma intrinsic(_mm_storel_epi64)
#pragma intrinsic(_mm_stream_pd)
#pragma intrinsic(_mm_stream_si128)
#pragma intrinsic(_mm_stream_si32)
#pragma intrinsic(_mm_clflush)
#pragma intrinsic(_mm_lfence)
#pragma intrinsic(_mm_mfence)
#pragma intrinsic(_mm_packs_epi16)
#pragma intrinsic(_mm_packs_epi32)
#pragma intrinsic(_mm_packus_epi16)
#pragma intrinsic(_mm_extract_epi16)
#pragma intrinsic(_mm_insert_epi16)
#pragma intrinsic(_mm_movemask_epi8)
#pragma intrinsic(_mm_shuffle_epi32)
#pragma intrinsic(_mm_shufflelo_epi16)
#pragma intrinsic(_mm_shufflehi_epi16)
#pragma intrinsic(_mm_unpackhi_epi8)
#pragma intrinsic(_mm_unpackhi_epi16)
#pragma intrinsic(_mm_unpackhi_epi32)
#pragma intrinsic(_mm_unpackhi_epi64)
#pragma intrinsic(_mm_unpacklo_epi8)
#pragma intrinsic(_mm_unpacklo_epi16)
#pragma intrinsic(_mm_unpacklo_epi32)
#pragma intrinsic(_mm_unpacklo_epi64)
//#pragma intrinsic(_mm_movepi64_pi64)
//#pragma intrinsic(_mm_movpi64_epi64)
#pragma intrinsic(_mm_move_epi64)
#pragma intrinsic(_mm_unpackhi_pd)
#pragma intrinsic(_mm_unpacklo_pd)
#pragma intrinsic(_mm_movemask_pd)
#pragma intrinsic(_mm_shuffle_pd)
#pragma intrinsic(_mm_castpd_ps)
#pragma intrinsic(_mm_castpd_si128)
#pragma intrinsic(_mm_castps_pd)
#pragma intrinsic(_mm_castps_si128)
#pragma intrinsic(_mm_castsi128_ps)
#pragma intrinsic(_mm_castsi128_pd)
#pragma intrinsic(_mm_pause)

#else /* _MSC_VER */

/*
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h
  Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h
  unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h
*/
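
/* Implementation note: the inlines below prefer a generic builtin when the
   compiler has one (probed via HAS_BUILTIN, which is expected to come from
   the shared intrinsics support headers) and fall back to the corresponding
   x86-specific __builtin_ia32_* form otherwise. */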

__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b)
{
    a[0] += b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a + (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    a[0] -= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a - (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b)
{
    a[0] *= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a * (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b)
{
    a[0] /= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a / (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_sqrtsd((__v2df)b);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a)
{
    return __builtin_ia32_sqrtpd((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_minsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_minpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    return (__m128d)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a ^ (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
{
    return __builtin_ia32_cvtpd2ps((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtps2pd(a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtdq2pd((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
{
    return __builtin_ia32_cvtsd2si((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
    return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a, int b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
{
    return __builtin_ia32_cvttsd2si((__v2df)a);
}

__INTRIN_INLINE_MMX __m64 _mm_cvtpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMX __m64 _mm_cvttpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMX __m128d _mm_cvtpi32_pd(__m64 a)
{
    return __builtin_ia32_cvtpi2pd((__v2si)a);
}

__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
{
    return *(const __m128d *)dp;
}

__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
{
    struct __mm_load1_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, __u};
}
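
/* Implementation note: the __attribute__((__packed__, __may_alias__)) wrapper
   struct used above, and throughout the unaligned load/store helpers below,
   removes the type's natural alignment requirement and exempts the access
   from strict-aliasing analysis, so the compiler must emit a genuinely
   unaligned load or store. */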

// GCC:
/* Create a selector for use with the SHUFPD instruction.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
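
/* Example: bit 0 of the selector chooses the lane taken from the first
   operand, bit 1 the lane taken from the second, so
       __m128d r = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0));
   yields { a[0], b[1] }. */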

__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    __m128d u = *(const __m128d *)dp;
    return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
#else
    return (__m128d){ dp[1], dp[0] };
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
{
    struct __loadu_pd {
      __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_pd *)dp)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
{
    struct __loadu_si64 {
      long long __v;
    } __attribute__((__packed__, __may_alias__));
    long long __u = ((const struct __loadu_si64 *)a)->__v;
    return __extension__(__m128i)(__v2di){__u, 0LL};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
{
    struct __loadu_si32 {
      int __v;
    } __attribute__((__packed__, __may_alias__));
    int __u = ((const struct __loadu_si32 *)a)->__v;
    return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
{
    struct __loadu_si16 {
      short __v;
    } __attribute__((__packed__, __may_alias__));
    short __u = ((const struct __loadu_si16 *)a)->__v;
    return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
{
    struct __mm_load_sd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
    return __extension__(__m128d){__u, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
{
    struct __mm_loadh_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
    return __extension__(__m128d){a[0], __u};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
{
    struct __mm_loadl_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128d)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: the result is meant to be
       indeterminate, and this idiom keeps compilers from flagging the
       intentionally uninitialized variable. */
    __m128d undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
{
    return __extension__(__m128d){w, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
{
    return __extension__(__m128d){w, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
{
    return __extension__(__m128d){x, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
{
    return __extension__(__m128d){w, x};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
{
    return __extension__(__m128d){0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
{
    struct __mm_store_sd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_store_sd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
{
    *(__m128d *)dp = a;
}

__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
    _mm_store_pd(dp, a);
#else
    dp[0] = a[0];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
{
    struct __storeu_pd {
      __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_pd *)dp)->__v = a;
}

__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
    *(__m128d *)dp = a;
#else
    dp[0] = a[1];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
}

__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a + (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a + (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a + (__v4su)b);
}

__INTRIN_INLINE_MMX __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a + (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
#endif
}
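
/* Example: the saturating forms clamp instead of wrapping. With every byte
   lane holding 250, _mm_adds_epu8(v, _mm_set1_epi8(10)) gives 255 (0xFF) in
   each lane, where plain _mm_add_epi8 would wrap around to 4. */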

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a * (__v8hu)b);
}

__INTRIN_INLINE_MMX __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a - (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a - (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a - (__v4su)b);
}

__INTRIN_INLINE_MMX __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a - (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return (__m128i)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a ^ (__v2du)b);
}

#define _mm_slli_si128(a, imm) \
    ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
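
/* Note: _mm_slli_si128 (and _mm_srli_si128 below) shift the full 128-bit
   value by imm bytes, not bits. imm must be an integer constant expression,
   which is why these two are macros rather than inline functions. */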

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psllqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

#define _mm_srli_si128(a, imm) \
    ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psrlqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qi)a == (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a == (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a == (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    /* This function always performs a signed comparison, but __v16qi is a char
       which may be signed or unsigned, so use __v16qs. */
    return (__m128i)((__v16qs)a > (__v16qs)b);
}
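
/* Example: the integer comparisons produce per-lane masks, all ones on true
   and all zeros on false; e.g.
       __m128i m = _mm_cmpgt_epi8(_mm_set1_epi8(5), _mm_set1_epi8(-1));
   sets every byte of m to 0xFF, ready for _mm_and_si128()/_mm_movemask_epi8(). */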

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a > (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a > (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi8(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi16(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi32(b, a);
}

#ifdef _M_AMD64

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
{
    return __builtin_ia32_cvtsd2si64((__v2df)a);
}

__INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
{
    return __builtin_ia32_cvttsd2si64((__v2df)a);
}
#endif

__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
#else
    return __builtin_ia32_cvtdq2ps((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
{
    return __extension__(__m128i)(__v4si){a, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
{
    return __extension__(__m128i)(__v2di){a, 0};
}

__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
{
    __v4si b = (__v4si)a;
    return b[0];
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
{
    return *p;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
{
    struct __loadu_si128 {
      __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_si128 *)p)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
{
    struct __mm_loadl_epi64_struct {
      long long __u;
    } __attribute__((__packed__, __may_alias__));
    return __extension__(__m128i){
        ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128i)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization; see _mm_undefined_pd() above. */
    __m128i undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
{
    return __extension__(__m128i)(__v2di){q0, q1};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
{
    return _mm_set_epi64x((long long)q1, (long long)q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
}
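
/* Note the argument order: _mm_set_epi32(i3, i2, i1, i0) places i0 in the
   lowest lane, so _mm_cvtsi128_si32(_mm_set_epi32(3, 2, 1, 0)) returns 0.
   The _mm_setr_* variants further below take their arguments in memory
   (low-to-high) order instead. */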
1579 
1580 __INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
1581     short w7, short w6, short w5, short w4,
1582     short w3, short w2, short w1, short w0)
1583 {
1584     return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
1585 }
1586 
1587 __INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
1588     char b15, char b14, char b13, char b12,
1589     char b11, char b10, char b9, char b8,
1590     char b7, char b6, char b5, char b4,
1591     char b3, char b2, char b1, char b0)
1592 {
1593     return __extension__(__m128i)(__v16qi){
1594         b0, b1, b2,  b3,  b4,  b5,  b6,  b7,
1595         b8, b9, b10, b11, b12, b13, b14, b15};
1596 }
1597 
1598 __INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
1599 {
1600     return _mm_set_epi64x(q, q);
1601 }
1602 
1603 __INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
1604 {
1605     return _mm_set_epi64(q, q);
1606 }
1607 
1608 __INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
1609 {
1610     return _mm_set_epi32(i, i, i, i);
1611 }
1612 
1613 __INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
1614 {
1615     return _mm_set_epi16(w, w, w, w, w, w, w, w);
1616 }
1617 
1618 __INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
1619 {
1620     return _mm_set_epi8(b, b, b, b, b, b, b, b, b, b, b,
1621                         b, b, b, b, b);
1622 }
1623 
1624 __INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
1625 {
1626     return _mm_set_epi64(q1, q0);
1627 }
1628 
1629 __INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
1630 {
1631     return _mm_set_epi32(i3, i2, i1, i0);
1632 }
1633 
1634 __INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
1635     short w0, short w1, short w2, short w3,
1636     short w4, short w5, short w6, short w7)
1637 {
1638     return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
1639 }
1640 
1641 __INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
1642     char b0, char b1, char b2, char b3,
1643     char b4, char b5, char b6, char b7,
1644     char b8, char b9, char b10,  char b11,
1645     char b12, char b13, char b14, char b15)
1646 {
1647     return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
1648                         b7, b6, b5, b4, b3, b2, b1, b0);
1649 }
1650 
1651 __INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
1652 {
1653     return __extension__(__m128i)(__v2di){0LL, 0LL};
1654 }
1655 
1656 __INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
1657 {
1658     *p = b;
1659 }
1660 
1661 __INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
1662 {
1663     struct __storeu_si128 {
1664       __m128i_u __v;
1665     } __attribute__((__packed__, __may_alias__));
1666     ((struct __storeu_si128 *)p)->__v = b;
1667 }
1668 
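/* The _mm_storeu_siNN helpers below store only the low 64/32/16 bits of the
 * vector through a possibly unaligned pointer, using the same packed-struct
 * idiom as the unaligned loads above. */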
__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
{
    struct __storeu_si64 {
      long long __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
{
    struct __storeu_si32 {
      int __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
{
    struct __storeu_si16 {
      short __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
}

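/* Byte-masked store (MASKMOVDQU): each byte of d is written to p[i] only if
 * the most significant bit of the corresponding byte of n is set; the store
 * is non-temporal. */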
__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
    __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
      long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

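/* Non-temporal (streaming) stores: these hint that the data will not be
 * reused soon, so the CPU may bypass the cache hierarchy. They are weakly
 * ordered; use _mm_sfence() before other agents read the stored data. */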
__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di*)p);
#else
    __builtin_ia32_movntdq(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif

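/* Cache and memory-ordering primitives: _mm_clflush flushes the cache line
 * containing p from every level of the cache hierarchy, _mm_lfence
 * serializes preceding loads, and _mm_mfence serializes all preceding loads
 * and stores. */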
void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

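/* Pack with saturation: each source lane is narrowed to half its width,
 * clamping values that do not fit (signed saturation for _mm_packs_*,
 * unsigned saturation for _mm_packus_epi16). a supplies the low half of the
 * result and b the high half. */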
__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

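/* _mm_extract_epi16 returns word imm (0-7) of a, zero-extended to int;
 * _mm_insert_epi16 returns a copy of a with word imm replaced by the low
 * 16 bits of b. These are macros because imm must be a compile-time
 * constant. */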
#define _mm_extract_epi16(a, imm)                                              \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm)                                            \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                          (int)(imm)))

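/* Builds a 16-bit mask from the most significant bit of each of the 16
 * bytes of a. */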
__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
{
    return __builtin_ia32_pmovmskb128((__v16qi)a);
}

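/* The shuffle immediate packs four 2-bit lane selectors; the
 * _MM_SHUFFLE(z, y, x, w) helper from xmmintrin.h builds it. For example,
 * _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)) reverses the four 32-bit
 * lanes. _mm_shufflelo_epi16 permutes only the low four words and copies the
 * high four unchanged; _mm_shufflehi_epi16 does the opposite. */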
#define _mm_shuffle_epi32(a, imm)                                              \
    ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

#define _mm_shufflelo_epi16(a, imm)                                            \
    ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

#define _mm_shufflehi_epi16(a, imm)                                            \
    ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

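/* Interleave (unpack) family: _mm_unpackhi_* interleaves the elements of the
 * high halves of a and b (taking from a first), _mm_unpacklo_* the low
 * halves. In the shufflevector paths below, indices 0..N-1 select from a and
 * indices N..2N-1 select from b. */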
__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
        16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
#else
    return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
                                            8 + 5, 6, 8 + 6, 7, 8 + 7);
#else
    return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
                                            4 + 3);
#else
    return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
#else
    return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
        16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
#else
    return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
                                            8 + 1, 2, 8 + 2, 3, 8 + 3);
#else
    return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
                                            4 + 1);
#else
    return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
#else
    return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
{
    return (__m64)a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
{
    return __extension__(__m128i)(__v2di){(long long)a, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
#else
    return (__m128i)__builtin_ia32_movq128((__v2di)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
#else
    return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
#else
    return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
{
    return __builtin_ia32_movmskpd((__v2df)a);
}

#define _mm_shuffle_pd(a, b, i)                                                \
    ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                    (int)(i)))

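/* The _mm_cast* functions only reinterpret the 128-bit pattern between
 * float, double, and integer vector types; they generate no instructions. */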
__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
{
    return (__m128d)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
{
    return (__m128d)a;
}

void _mm_pause(void);

#endif /* _MSC_VER */

#endif /* _INCLUDED_EMM */