/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#pragma once
#ifndef _INCLUDED_EMM
#define _INCLUDED_EMM

#include <vcruntime.h>
#include <xmmintrin.h>

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i
{
    __int8 m128i_i8[16];
    __int16 m128i_i16[8];
    __int32 m128i_i32[4];
    __int64 m128i_i64[2];
    unsigned __int8 m128i_u8[16];
    unsigned __int16 m128i_u16[8];
    unsigned __int32 m128i_u32[4];
    unsigned __int64 m128i_u64[2];
} __m128i;
_STATIC_ASSERT(sizeof(__m128i) == 16);
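
/* Illustrative sketch (not part of the API): because the MSVC-side __m128i is
 * a union, individual lanes can be inspected directly for debugging, e.g.
 *
 *     __m128i v = _mm_set1_epi32(7);
 *     int lane2 = v.m128i_i32[2];   // reads the third 32-bit lane (7)
 *
 * Portable code should prefer the extract/store intrinsics instead. */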

typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d
{
    double m128d_f64[2];
} __m128d;

typedef __declspec(align(1)) __m128i __m128i_u;

#define __ATTRIBUTE_SSE2__

#else /* _MSC_VER */

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
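
/* Illustrative note: plain char may be signed or unsigned depending on the
 * target ABI, so a comparison like (__v16qi)a > (__v16qi)b could silently be
 * unsigned. _mm_cmpgt_epi8 below therefore casts through __v16qs to force a
 * signed element-wise comparison. */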

#ifdef __clang__
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128)))
#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2")))
#endif
#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__
#define __INTRIN_INLINE_MMXSSE2 __INTRIN_INLINE __ATTRIBUTE_MMXSSE2__

#endif /* _MSC_VER */

#ifdef __cplusplus
extern "C" {
#endif

extern __m128d _mm_add_sd(__m128d a, __m128d b);
extern __m128d _mm_add_pd(__m128d a, __m128d b);
extern __m128d _mm_sub_sd(__m128d a, __m128d b);
extern __m128d _mm_sub_pd(__m128d a, __m128d b);
extern __m128d _mm_mul_sd(__m128d a, __m128d b);
extern __m128d _mm_mul_pd(__m128d a, __m128d b);
extern __m128d _mm_div_sd(__m128d a, __m128d b);
extern __m128d _mm_div_pd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_sd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_pd(__m128d a);
extern __m128d _mm_min_sd(__m128d a, __m128d b);
extern __m128d _mm_min_pd(__m128d a, __m128d b);
extern __m128d _mm_max_sd(__m128d a, __m128d b);
extern __m128d _mm_max_pd(__m128d a, __m128d b);
extern __m128d _mm_and_pd(__m128d a, __m128d b);
extern __m128d _mm_andnot_pd(__m128d a, __m128d b);
extern __m128d _mm_or_pd(__m128d a, __m128d b);
extern __m128d _mm_xor_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmple_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmple_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b);
extern int _mm_comieq_sd(__m128d a, __m128d b);
extern int _mm_comilt_sd(__m128d a, __m128d b);
extern int _mm_comile_sd(__m128d a, __m128d b);
extern int _mm_comigt_sd(__m128d a, __m128d b);
extern int _mm_comige_sd(__m128d a, __m128d b);
extern int _mm_comineq_sd(__m128d a, __m128d b);
extern int _mm_ucomieq_sd(__m128d a, __m128d b);
extern int _mm_ucomilt_sd(__m128d a, __m128d b);
extern int _mm_ucomile_sd(__m128d a, __m128d b);
extern int _mm_ucomigt_sd(__m128d a, __m128d b);
extern int _mm_ucomige_sd(__m128d a, __m128d b);
extern int _mm_ucomineq_sd(__m128d a, __m128d b);
extern __m128 _mm_cvtpd_ps(__m128d a);
extern __m128d _mm_cvtps_pd(__m128 a);
extern __m128d _mm_cvtepi32_pd(__m128i a);
extern __m128i _mm_cvtpd_epi32(__m128d a);
extern int _mm_cvtsd_si32(__m128d a);
extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b);
extern __m128d _mm_cvtsi32_sd(__m128d a, int b);
extern __m128d _mm_cvtss_sd(__m128d a, __m128 b);
extern __m128i _mm_cvttpd_epi32(__m128d a);
extern int _mm_cvttsd_si32(__m128d a);
extern __m64 _mm_cvtpd_pi32(__m128d a);
extern __m64 _mm_cvttpd_pi32(__m128d a);
extern __m128d _mm_cvtpi32_pd(__m64 a);
extern double _mm_cvtsd_f64(__m128d a);
extern __m128d _mm_load_pd(double const *dp);
extern __m128d _mm_load1_pd(double const *dp);
extern __m128d _mm_loadr_pd(double const *dp);
extern __m128d _mm_loadu_pd(double const *dp);
//extern __m128i _mm_loadu_si64(void const *a);
//extern __m128i _mm_loadu_si32(void const *a);
//extern __m128i _mm_loadu_si16(void const *a);
extern __m128d _mm_load_sd(double const *dp);
extern __m128d _mm_loadh_pd(__m128d a, double const *dp);
extern __m128d _mm_loadl_pd(__m128d a, double const *dp);
//extern __m128d _mm_undefined_pd(void);
extern __m128d _mm_set_sd(double w);
extern __m128d _mm_set1_pd(double w);
extern __m128d _mm_set_pd(double w, double x);
extern __m128d _mm_setr_pd(double w, double x);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d a, __m128d b);
extern void _mm_store_sd(double *dp, __m128d a);
extern void _mm_store_pd(double *dp, __m128d a);
extern void _mm_store1_pd(double *dp, __m128d a);
extern void _mm_storeu_pd(double *dp, __m128d a);
extern void _mm_storer_pd(double *dp, __m128d a);
extern void _mm_storeh_pd(double *dp, __m128d a);
extern void _mm_storel_pd(double *dp, __m128d a);
extern __m128i _mm_add_epi8(__m128i a, __m128i b);
extern __m128i _mm_add_epi16(__m128i a, __m128i b);
extern __m128i _mm_add_epi32(__m128i a, __m128i b);
extern __m64 _mm_add_si64(__m64 a, __m64 b);
extern __m128i _mm_add_epi64(__m128i a, __m128i b);
extern __m128i _mm_adds_epi8(__m128i a, __m128i b);
extern __m128i _mm_adds_epi16(__m128i a, __m128i b);
extern __m128i _mm_adds_epu8(__m128i a, __m128i b);
extern __m128i _mm_adds_epu16(__m128i a, __m128i b);
extern __m128i _mm_avg_epu8(__m128i a, __m128i b);
extern __m128i _mm_avg_epu16(__m128i a, __m128i b);
extern __m128i _mm_madd_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epu8(__m128i a, __m128i b);
extern __m128i _mm_min_epi16(__m128i a, __m128i b);
extern __m128i _mm_min_epu8(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b);
extern __m128i _mm_mullo_epi16(__m128i a, __m128i b);
extern __m64 _mm_mul_su32(__m64 a, __m64 b);
extern __m128i _mm_mul_epu32(__m128i a, __m128i b);
extern __m128i _mm_sad_epu8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi16(__m128i a, __m128i b);
extern __m128i _mm_sub_epi32(__m128i a, __m128i b);
extern __m64 _mm_sub_si64(__m64 a, __m64 b);
extern __m128i _mm_sub_epi64(__m128i a, __m128i b);
extern __m128i _mm_subs_epi8(__m128i a, __m128i b);
extern __m128i _mm_subs_epi16(__m128i a, __m128i b);
extern __m128i _mm_subs_epu8(__m128i a, __m128i b);
extern __m128i _mm_subs_epu16(__m128i a, __m128i b);
extern __m128i _mm_and_si128(__m128i a, __m128i b);
extern __m128i _mm_andnot_si128(__m128i a, __m128i b);
extern __m128i _mm_or_si128(__m128i a, __m128i b);
extern __m128i _mm_xor_si128(__m128i a, __m128i b);
extern __m128i _mm_slli_si128(__m128i a, int i);
extern __m128i _mm_slli_epi16(__m128i a, int count);
extern __m128i _mm_sll_epi16(__m128i a, __m128i count);
extern __m128i _mm_slli_epi32(__m128i a, int count);
extern __m128i _mm_sll_epi32(__m128i a, __m128i count);
extern __m128i _mm_slli_epi64(__m128i a, int count);
extern __m128i _mm_sll_epi64(__m128i a, __m128i count);
extern __m128i _mm_srai_epi16(__m128i a, int count);
extern __m128i _mm_sra_epi16(__m128i a, __m128i count);
extern __m128i _mm_srai_epi32(__m128i a, int count);
extern __m128i _mm_sra_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_si128(__m128i a, int imm);
extern __m128i _mm_srli_epi16(__m128i a, int count);
extern __m128i _mm_srl_epi16(__m128i a, __m128i count);
extern __m128i _mm_srli_epi32(__m128i a, int count);
extern __m128i _mm_srl_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_epi64(__m128i a, int count);
extern __m128i _mm_srl_epi64(__m128i a, __m128i count);
extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b);
#ifdef _M_AMD64
extern __m128d _mm_cvtsi64_sd(__m128d a, long long b);
extern long long _mm_cvtsd_si64(__m128d a);
extern long long _mm_cvttsd_si64(__m128d a);
#endif
extern __m128 _mm_cvtepi32_ps(__m128i a);
extern __m128i _mm_cvtps_epi32(__m128 a);
extern __m128i _mm_cvttps_epi32(__m128 a);
extern __m128i _mm_cvtsi32_si128(int a);
#ifdef _M_AMD64
extern __m128i _mm_cvtsi64_si128(long long a);
#endif
extern int _mm_cvtsi128_si32(__m128i a);
#ifdef _M_AMD64
extern long long _mm_cvtsi128_si64(__m128i a);
#endif
extern __m128i _mm_load_si128(__m128i const *p);
extern __m128i _mm_loadu_si128(__m128i_u const *p);
extern __m128i _mm_loadl_epi64(__m128i_u const *p);
//extern __m128i _mm_undefined_si128(void);
//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME
extern __m128i _mm_set_epi64(__m64 q1, __m64 q0);
extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
//extern __m128i _mm_set1_epi64x(long long q); // FIXME
extern __m128i _mm_set1_epi64(__m64 q);
extern __m128i _mm_set1_epi32(int i);
extern __m128i _mm_set1_epi16(short w);
extern __m128i _mm_set1_epi8(char b);
extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang?
extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1);
extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
extern __m128i _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setzero_si128(void);
extern void _mm_store_si128(__m128i *p, __m128i b);
extern void _mm_storeu_si128(__m128i_u *p, __m128i b);
//extern void _mm_storeu_si64(void *p, __m128i b);
//extern void _mm_storeu_si32(void *p, __m128i b);
//extern void _mm_storeu_si16(void *p, __m128i b);
extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p);
extern void _mm_storel_epi64(__m128i_u *p, __m128i a);
extern void _mm_stream_pd(double *p, __m128d a);
extern void _mm_stream_si128(__m128i *p, __m128i a);
extern void _mm_stream_si32(int *p, int a);
extern void _mm_clflush(void const *p);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern __m128i _mm_packs_epi16(__m128i a, __m128i b);
extern __m128i _mm_packs_epi32(__m128i a, __m128i b);
extern __m128i _mm_packus_epi16(__m128i a, __m128i b);
extern int _mm_extract_epi16(__m128i a, int imm);
extern __m128i _mm_insert_epi16(__m128i a, int b, int imm);
extern int _mm_movemask_epi8(__m128i a);
extern __m128i _mm_shuffle_epi32(__m128i a, int imm);
extern __m128i _mm_shufflelo_epi16(__m128i a, int imm);
extern __m128i _mm_shufflehi_epi16(__m128i a, int imm);
extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
extern __m64 _mm_movepi64_pi64(__m128i a);
extern __m128i _mm_movpi64_epi64(__m64 a);
extern __m128i _mm_move_epi64(__m128i a);
extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b);
extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b);
extern int _mm_movemask_pd(__m128d a);
extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm);
extern __m128 _mm_castpd_ps(__m128d a);
extern __m128i _mm_castpd_si128(__m128d a);
extern __m128d _mm_castps_pd(__m128 a);
extern __m128i _mm_castps_si128(__m128 a);
extern __m128 _mm_castsi128_ps(__m128i a);
extern __m128d _mm_castsi128_pd(__m128i a);
extern void _mm_pause(void);

/* Alternate names */
#define _mm_set_pd1(a) _mm_set1_pd(a)
#define _mm_load_pd1(p) _mm_load1_pd(p)
#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a))
#define _mm_bslli_si128 _mm_slli_si128
#define _mm_bsrli_si128 _mm_srli_si128
#define _mm_stream_si64 _mm_stream_si64x
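
/* Illustrative note: the aliases above expand to the primary names, so the
 * two calls in this sketch are interchangeable:
 *
 *     _mm_store1_pd(buf, v);
 *     _mm_store_pd1(buf, v);   // same expansion, older Intel naming
 */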

#if defined(_MSC_VER) && !defined(__clang__)

#pragma intrinsic(_mm_add_sd)
#pragma intrinsic(_mm_add_pd)
#pragma intrinsic(_mm_sub_sd)
#pragma intrinsic(_mm_sub_pd)
#pragma intrinsic(_mm_mul_sd)
#pragma intrinsic(_mm_mul_pd)
#pragma intrinsic(_mm_div_sd)
#pragma intrinsic(_mm_div_pd)
#pragma intrinsic(_mm_sqrt_sd)
#pragma intrinsic(_mm_sqrt_pd)
#pragma intrinsic(_mm_min_sd)
#pragma intrinsic(_mm_min_pd)
#pragma intrinsic(_mm_max_sd)
#pragma intrinsic(_mm_max_pd)
#pragma intrinsic(_mm_and_pd)
#pragma intrinsic(_mm_andnot_pd)
#pragma intrinsic(_mm_or_pd)
#pragma intrinsic(_mm_xor_pd)
#pragma intrinsic(_mm_cmpeq_pd)
#pragma intrinsic(_mm_cmplt_pd)
#pragma intrinsic(_mm_cmple_pd)
#pragma intrinsic(_mm_cmpgt_pd)
#pragma intrinsic(_mm_cmpge_pd)
#pragma intrinsic(_mm_cmpord_pd)
#pragma intrinsic(_mm_cmpunord_pd)
#pragma intrinsic(_mm_cmpneq_pd)
#pragma intrinsic(_mm_cmpnlt_pd)
#pragma intrinsic(_mm_cmpnle_pd)
#pragma intrinsic(_mm_cmpngt_pd)
#pragma intrinsic(_mm_cmpnge_pd)
#pragma intrinsic(_mm_cmpeq_sd)
#pragma intrinsic(_mm_cmplt_sd)
#pragma intrinsic(_mm_cmple_sd)
#pragma intrinsic(_mm_cmpgt_sd)
#pragma intrinsic(_mm_cmpge_sd)
#pragma intrinsic(_mm_cmpord_sd)
#pragma intrinsic(_mm_cmpunord_sd)
#pragma intrinsic(_mm_cmpneq_sd)
#pragma intrinsic(_mm_cmpnlt_sd)
#pragma intrinsic(_mm_cmpnle_sd)
#pragma intrinsic(_mm_cmpngt_sd)
#pragma intrinsic(_mm_cmpnge_sd)
#pragma intrinsic(_mm_comieq_sd)
#pragma intrinsic(_mm_comilt_sd)
#pragma intrinsic(_mm_comile_sd)
#pragma intrinsic(_mm_comigt_sd)
#pragma intrinsic(_mm_comige_sd)
#pragma intrinsic(_mm_comineq_sd)
#pragma intrinsic(_mm_ucomieq_sd)
#pragma intrinsic(_mm_ucomilt_sd)
#pragma intrinsic(_mm_ucomile_sd)
#pragma intrinsic(_mm_ucomigt_sd)
#pragma intrinsic(_mm_ucomige_sd)
#pragma intrinsic(_mm_ucomineq_sd)
#pragma intrinsic(_mm_cvtpd_ps)
#pragma intrinsic(_mm_cvtps_pd)
#pragma intrinsic(_mm_cvtepi32_pd)
#pragma intrinsic(_mm_cvtpd_epi32)
#pragma intrinsic(_mm_cvtsd_si32)
#pragma intrinsic(_mm_cvtsd_ss)
#pragma intrinsic(_mm_cvtsi32_sd)
#pragma intrinsic(_mm_cvtss_sd)
#pragma intrinsic(_mm_cvttpd_epi32)
#pragma intrinsic(_mm_cvttsd_si32)
//#pragma intrinsic(_mm_cvtpd_pi32)
//#pragma intrinsic(_mm_cvttpd_pi32)
//#pragma intrinsic(_mm_cvtpi32_pd)
#pragma intrinsic(_mm_cvtsd_f64)
#pragma intrinsic(_mm_load_pd)
#pragma intrinsic(_mm_load1_pd)
#pragma intrinsic(_mm_loadr_pd)
#pragma intrinsic(_mm_loadu_pd)
//#pragma intrinsic(_mm_loadu_si64)
//#pragma intrinsic(_mm_loadu_si32)
//#pragma intrinsic(_mm_loadu_si16)
#pragma intrinsic(_mm_load_sd)
#pragma intrinsic(_mm_loadh_pd)
#pragma intrinsic(_mm_loadl_pd)
//#pragma intrinsic(_mm_undefined_pd)
#pragma intrinsic(_mm_set_sd)
#pragma intrinsic(_mm_set1_pd)
#pragma intrinsic(_mm_set_pd)
#pragma intrinsic(_mm_setr_pd)
#pragma intrinsic(_mm_setzero_pd)
#pragma intrinsic(_mm_move_sd)
#pragma intrinsic(_mm_store_sd)
#pragma intrinsic(_mm_store_pd)
#pragma intrinsic(_mm_store1_pd)
#pragma intrinsic(_mm_storeu_pd)
#pragma intrinsic(_mm_storer_pd)
#pragma intrinsic(_mm_storeh_pd)
#pragma intrinsic(_mm_storel_pd)
#pragma intrinsic(_mm_add_epi8)
#pragma intrinsic(_mm_add_epi16)
#pragma intrinsic(_mm_add_epi32)
//#pragma intrinsic(_mm_add_si64)
#pragma intrinsic(_mm_add_epi64)
#pragma intrinsic(_mm_adds_epi8)
#pragma intrinsic(_mm_adds_epi16)
#pragma intrinsic(_mm_adds_epu8)
#pragma intrinsic(_mm_adds_epu16)
#pragma intrinsic(_mm_avg_epu8)
#pragma intrinsic(_mm_avg_epu16)
#pragma intrinsic(_mm_madd_epi16)
#pragma intrinsic(_mm_max_epi16)
#pragma intrinsic(_mm_max_epu8)
#pragma intrinsic(_mm_min_epi16)
#pragma intrinsic(_mm_min_epu8)
#pragma intrinsic(_mm_mulhi_epi16)
#pragma intrinsic(_mm_mulhi_epu16)
#pragma intrinsic(_mm_mullo_epi16)
//#pragma intrinsic(_mm_mul_su32)
#pragma intrinsic(_mm_mul_epu32)
#pragma intrinsic(_mm_sad_epu8)
#pragma intrinsic(_mm_sub_epi8)
#pragma intrinsic(_mm_sub_epi16)
#pragma intrinsic(_mm_sub_epi32)
//#pragma intrinsic(_mm_sub_si64)
#pragma intrinsic(_mm_sub_epi64)
#pragma intrinsic(_mm_subs_epi8)
#pragma intrinsic(_mm_subs_epi16)
#pragma intrinsic(_mm_subs_epu8)
#pragma intrinsic(_mm_subs_epu16)
#pragma intrinsic(_mm_and_si128)
#pragma intrinsic(_mm_andnot_si128)
#pragma intrinsic(_mm_or_si128)
#pragma intrinsic(_mm_xor_si128)
#pragma intrinsic(_mm_slli_si128)
#pragma intrinsic(_mm_slli_epi16)
#pragma intrinsic(_mm_sll_epi16)
#pragma intrinsic(_mm_slli_epi32)
#pragma intrinsic(_mm_sll_epi32)
#pragma intrinsic(_mm_slli_epi64)
#pragma intrinsic(_mm_sll_epi64)
#pragma intrinsic(_mm_srai_epi16)
#pragma intrinsic(_mm_sra_epi16)
#pragma intrinsic(_mm_srai_epi32)
#pragma intrinsic(_mm_sra_epi32)
#pragma intrinsic(_mm_srli_si128)
#pragma intrinsic(_mm_srli_epi16)
#pragma intrinsic(_mm_srl_epi16)
#pragma intrinsic(_mm_srli_epi32)
#pragma intrinsic(_mm_srl_epi32)
#pragma intrinsic(_mm_srli_epi64)
#pragma intrinsic(_mm_srl_epi64)
#pragma intrinsic(_mm_cmpeq_epi8)
#pragma intrinsic(_mm_cmpeq_epi16)
#pragma intrinsic(_mm_cmpeq_epi32)
#pragma intrinsic(_mm_cmpgt_epi8)
#pragma intrinsic(_mm_cmpgt_epi16)
#pragma intrinsic(_mm_cmpgt_epi32)
#pragma intrinsic(_mm_cmplt_epi8)
#pragma intrinsic(_mm_cmplt_epi16)
#pragma intrinsic(_mm_cmplt_epi32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_sd)
#pragma intrinsic(_mm_cvtsd_si64)
#pragma intrinsic(_mm_cvttsd_si64)
#endif
#pragma intrinsic(_mm_cvtepi32_ps)
#pragma intrinsic(_mm_cvtps_epi32)
#pragma intrinsic(_mm_cvttps_epi32)
#pragma intrinsic(_mm_cvtsi32_si128)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_si128)
#endif
#pragma intrinsic(_mm_cvtsi128_si32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi128_si64)
#endif
#pragma intrinsic(_mm_load_si128)
#pragma intrinsic(_mm_loadu_si128)
#pragma intrinsic(_mm_loadl_epi64)
//#pragma intrinsic(_mm_undefined_si128)
//#pragma intrinsic(_mm_set_epi64x)
//#pragma intrinsic(_mm_set_epi64)
#pragma intrinsic(_mm_set_epi32)
#pragma intrinsic(_mm_set_epi16)
#pragma intrinsic(_mm_set_epi8)
//#pragma intrinsic(_mm_set1_epi64x)
//#pragma intrinsic(_mm_set1_epi64)
#pragma intrinsic(_mm_set1_epi32)
#pragma intrinsic(_mm_set1_epi16)
#pragma intrinsic(_mm_set1_epi8)
#pragma intrinsic(_mm_setl_epi64)
//#pragma intrinsic(_mm_setr_epi64)
#pragma intrinsic(_mm_setr_epi32)
#pragma intrinsic(_mm_setr_epi16)
#pragma intrinsic(_mm_setr_epi8)
#pragma intrinsic(_mm_setzero_si128)
#pragma intrinsic(_mm_store_si128)
#pragma intrinsic(_mm_storeu_si128)
//#pragma intrinsic(_mm_storeu_si64)
//#pragma intrinsic(_mm_storeu_si32)
//#pragma intrinsic(_mm_storeu_si16)
#pragma intrinsic(_mm_maskmoveu_si128)
#pragma intrinsic(_mm_storel_epi64)
#pragma intrinsic(_mm_stream_pd)
#pragma intrinsic(_mm_stream_si128)
#pragma intrinsic(_mm_stream_si32)
#pragma intrinsic(_mm_clflush)
#pragma intrinsic(_mm_lfence)
#pragma intrinsic(_mm_mfence)
#pragma intrinsic(_mm_packs_epi16)
#pragma intrinsic(_mm_packs_epi32)
#pragma intrinsic(_mm_packus_epi16)
#pragma intrinsic(_mm_extract_epi16)
#pragma intrinsic(_mm_insert_epi16)
#pragma intrinsic(_mm_movemask_epi8)
#pragma intrinsic(_mm_shuffle_epi32)
#pragma intrinsic(_mm_shufflelo_epi16)
#pragma intrinsic(_mm_shufflehi_epi16)
#pragma intrinsic(_mm_unpackhi_epi8)
#pragma intrinsic(_mm_unpackhi_epi16)
#pragma intrinsic(_mm_unpackhi_epi32)
#pragma intrinsic(_mm_unpackhi_epi64)
#pragma intrinsic(_mm_unpacklo_epi8)
#pragma intrinsic(_mm_unpacklo_epi16)
#pragma intrinsic(_mm_unpacklo_epi32)
#pragma intrinsic(_mm_unpacklo_epi64)
//#pragma intrinsic(_mm_movepi64_pi64)
//#pragma intrinsic(_mm_movpi64_epi64)
#pragma intrinsic(_mm_move_epi64)
#pragma intrinsic(_mm_unpackhi_pd)
#pragma intrinsic(_mm_unpacklo_pd)
#pragma intrinsic(_mm_movemask_pd)
#pragma intrinsic(_mm_shuffle_pd)
#pragma intrinsic(_mm_castpd_ps)
#pragma intrinsic(_mm_castpd_si128)
#pragma intrinsic(_mm_castps_pd)
#pragma intrinsic(_mm_castps_si128)
#pragma intrinsic(_mm_castsi128_ps)
#pragma intrinsic(_mm_castsi128_pd)
#pragma intrinsic(_mm_pause)

#else /* _MSC_VER */

/*
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h
  Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h
  unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h
*/
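
/* Illustrative usage sketch (not part of this header): a horizontal add of the
 * two double lanes built from the inlines defined below.
 *
 *     static double sum2(const double *p)   // p need not be 16-byte aligned
 *     {
 *         __m128d v  = _mm_loadu_pd(p);                 // {p[0], p[1]}
 *         __m128d hi = _mm_unpackhi_pd(v, v);           // {p[1], p[1]}
 *         return _mm_cvtsd_f64(_mm_add_sd(v, hi));      // p[0] + p[1]
 *     }
 */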

__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b)
{
    a[0] += b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a + (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    a[0] -= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a - (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b)
{
    a[0] *= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a * (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b)
{
    a[0] /= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a / (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_sqrtsd((__v2df)b);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a)
{
    return __builtin_ia32_sqrtpd((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_minsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_minpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    return (__m128d)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a ^ (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
{
    return __builtin_ia32_cvtpd2ps((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtps2pd((__v4sf)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtdq2pd((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
{
    return __builtin_ia32_cvtsd2si((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
    return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a, int b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
{
    return __builtin_ia32_cvttsd2si((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvtpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvttpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m128d _mm_cvtpi32_pd(__m64 a)
{
    return __builtin_ia32_cvtpi2pd((__v2si)a);
}

__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
{
    return *(const __m128d *)dp;
}

__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
{
    struct __mm_load1_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, __u};
}

// GCC:
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1, fp0) \
    (((fp1) << 1) | (fp0))
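
/* Illustrative sketch: _MM_SHUFFLE2(1, 0) evaluates to 0x2, so
 *
 *     _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0))
 *
 * selects a[0] into the low result lane (fp0 == 0 picks lane 0 of a) and
 * b[1] into the high result lane (fp1 == 1 picks lane 1 of b). */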

__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    __m128d u = *(const __m128d *)dp;
    return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
#else
    return (__m128d){ dp[1], dp[0] };
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
{
    struct __loadu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_pd *)dp)->__v;
}
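
/* The packed/__may_alias__ wrapper struct above is the portable idiom for an
 * unaligned, aliasing-safe access: __packed__ drops the alignment requirement
 * so the compiler emits an unaligned 16-byte load, and __may_alias__ exempts
 * the access from strict-aliasing assumptions. The unaligned loads and stores
 * below reuse the same pattern. */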

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
{
    struct __loadu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    long long __u = ((const struct __loadu_si64 *)a)->__v;
    return __extension__(__m128i)(__v2di){__u, 0LL};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
{
    struct __loadu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    int __u = ((const struct __loadu_si32 *)a)->__v;
    return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
{
    struct __loadu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    short __u = ((const struct __loadu_si16 *)a)->__v;
    return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
{
    struct __mm_load_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
    return __extension__(__m128d){__u, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
{
    struct __mm_loadh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
    return __extension__(__m128d){a[0], __u};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
{
    struct __mm_loadl_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128d)__builtin_ia32_undef128();
#else
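    /* Deliberate self-initialization: this is the conventional way to
     * produce an indeterminate value without touching another object.
     * Compilers may warn (-Wuninitialized); the value is intentionally
     * unspecified, matching _mm_undefined_pd semantics. */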
    __m128d undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
{
    return __extension__(__m128d){w, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
{
    return __extension__(__m128d){w, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
{
    return __extension__(__m128d){x, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
{
    return __extension__(__m128d){w, x};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
{
    return __extension__(__m128d){0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
{
    struct __mm_store_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_store_sd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
{
    *(__m128d *)dp = a;
}

__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
    _mm_store_pd(dp, a);
#else
    dp[0] = a[0];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
{
    struct __storeu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_pd *)dp)->__v = a;
}

__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
    *(__m128d *)dp = a;
#else
    dp[0] = a[1];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
}

__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a + (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a + (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a + (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a + (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a * (__v8hu)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a - (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a - (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a - (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a - (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return (__m128i)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a ^ (__v2du)b);
}

#define _mm_slli_si128(a, imm) \
    ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
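
/* Illustrative sketch: the shift amount is a byte count and must be a
 * compile-time constant, e.g. _mm_slli_si128(v, 4) shifts the whole 128-bit
 * value left by 32 bits, filling with zeros. _mm_srli_si128 (below) is the
 * matching byte shift to the right. */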

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psllqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

#define _mm_srli_si128(a, imm) \
    ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psrlqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qi)a == (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a == (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a == (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    /* This function always performs a signed comparison, but __v16qi is a char
       which may be signed or unsigned, so use __v16qs. */
    return (__m128i)((__v16qs)a > (__v16qs)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a > (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a > (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi8(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi16(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi32(b, a);
}
1480
1481 #ifdef _M_AMD64
1482
_mm_cvtsi64_sd(__m128d a,long long b)1483 __INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
1484 {
1485 a[0] = b;
1486 return a;
1487 }
1488
_mm_cvtsd_si64(__m128d a)1489 __INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
1490 {
1491 return __builtin_ia32_cvtsd2si64((__v2df)a);
1492 }
1493
_mm_cvttsd_si64(__m128d a)1494 __INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
1495 {
1496 return __builtin_ia32_cvttsd2si64((__v2df)a);
1497 }
1498 #endif
1499
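/* The cvt conversions below round according to the current MXCSR rounding
 * mode (round-to-nearest-even by default), whereas the cvtt variants always
 * truncate toward zero; the same distinction applies to the scalar
 * conversions above. */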
__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
#else
    return __builtin_ia32_cvtdq2ps((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
{
    return __extension__(__m128i)(__v4si){a, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
{
    return __extension__(__m128i)(__v2di){a, 0};
}

__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
{
    __v4si b = (__v4si)a;
    return b[0];
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
{
    return *p;
}

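/* The packed, may_alias wrapper struct tells the compiler that the access is
 * unaligned and may alias any other object; the same idiom is used by the
 * other unaligned load/store helpers below. */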
__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
{
    struct __loadu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_si128 *)p)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
{
    struct __mm_loadl_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    return __extension__(__m128i){
        ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128i)__builtin_ia32_undef128();
#else
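    /* Deliberate self-initialization: the intrinsic only promises an
     * indeterminate value, and this idiom avoids a diagnostic about
     * reading an uninitialized variable. */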
    __m128i undef = undef;
    return undef;
#endif
}

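/* The _mm_set_* constructors take their arguments with the highest-indexed
 * element first, while the _mm_setr_* variants below take them in memory
 * (low-to-high) order; e.g. _mm_set_epi32(3, 2, 1, 0) places 0 in element 0. */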
__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
{
    return __extension__(__m128i)(__v2di){q0, q1};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
{
    return _mm_set_epi64x((long long)q1, (long long)q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
    short w7, short w6, short w5, short w4,
    short w3, short w2, short w1, short w0)
{
    return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
    char b15, char b14, char b13, char b12,
    char b11, char b10, char b9, char b8,
    char b7, char b6, char b5, char b4,
    char b3, char b2, char b1, char b0)
{
    return __extension__(__m128i)(__v16qi){
        b0, b1, b2, b3, b4, b5, b6, b7,
        b8, b9, b10, b11, b12, b13, b14, b15};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
{
    return _mm_set_epi64x(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
{
    return _mm_set_epi64(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
{
    return _mm_set_epi16(w, w, w, w, w, w, w, w);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
{
    return _mm_set_epi8(b, b, b, b, b, b, b, b,
                        b, b, b, b, b, b, b, b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
{
    return _mm_set_epi64(q1, q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
    return _mm_set_epi32(i3, i2, i1, i0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
    short w0, short w1, short w2, short w3,
    short w4, short w5, short w6, short w7)
{
    return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
    char b0, char b1, char b2, char b3,
    char b4, char b5, char b6, char b7,
    char b8, char b9, char b10, char b11,
    char b12, char b13, char b14, char b15)
{
    return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
                        b7, b6, b5, b4, b3, b2, b1, b0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
{
    return __extension__(__m128i)(__v2di){0LL, 0LL};
}

__INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
{
    *p = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
{
    struct __storeu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si128 *)p)->__v = b;
}

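/* The following helpers store only the low 64/32/16 bits of the vector to a
 * potentially unaligned address. */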
__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
{
    struct __storeu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
{
    struct __storeu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
{
    struct __storeu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
}

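/* _mm_maskmoveu_si128 conditionally stores each byte of d to p[i] when the
 * most significant bit of the corresponding byte of n is set; the store is
 * non-temporal and p need not be aligned. */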
__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
    __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

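/* The _mm_stream_* stores are non-temporal: they bypass the cache hierarchy
 * and are weakly ordered, so they are typically followed by _mm_sfence (or
 * _mm_mfence) before the data is read by another agent. _mm_stream_pd and
 * _mm_stream_si128 require a 16-byte aligned address. */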
__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di *)p);
#else
    __builtin_ia32_movntdq(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif

void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

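/* The pack operations narrow each source element to half its width with
 * saturation: _mm_packs_* clamp to the signed range (e.g. [-128, 127] for
 * bytes), while _mm_packus_epi16 clamps to the unsigned range [0, 255].
 * Elements from a occupy the low half of the result, those from b the
 * high half. */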
__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

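/* For the two macros below, imm must be a constant in the range 0-7;
 * _mm_extract_epi16 zero-extends the selected 16-bit element to int. */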
#define _mm_extract_epi16(a, imm) \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm) \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                          (int)(imm)))

__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
{
    return __builtin_ia32_pmovmskb128((__v16qi)a);
}

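/* The shuffle immediates are usually built with _MM_SHUFFLE from
 * <xmmintrin.h>; e.g. _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0))
 * broadcasts element 0 to all four lanes. _mm_shufflelo_epi16 and
 * _mm_shufflehi_epi16 permute only the low or high four words and copy the
 * other half through unchanged. */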
#define _mm_shuffle_epi32(a, imm) \
    ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

#define _mm_shufflelo_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

#define _mm_shufflehi_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

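/* The unpack operations interleave elements from the high (unpackhi) or low
 * (unpacklo) half of the two sources; e.g. _mm_unpacklo_epi8 produces
 * a0, b0, a1, b1, ..., a7, b7. */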
__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
        16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
#else
    return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
                                            8 + 5, 6, 8 + 6, 7, 8 + 7);
#else
    return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
                                            4 + 3);
#else
    return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
#else
    return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
        16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
#else
    return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
                                            8 + 1, 2, 8 + 2, 3, 8 + 3);
#else
    return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
                                            4 + 1);
#else
    return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
#else
    return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
{
    return (__m64)a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
{
    return __extension__(__m128i)(__v2di){(long long)a, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
#else
    return (__m128i)__builtin_ia32_movq128((__v2di)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
#else
    return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
#else
    return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
{
    return __builtin_ia32_movmskpd((__v2df)a);
}

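/* In the _mm_shuffle_pd immediate, bit 0 selects which element of a goes to
 * the low result element and bit 1 selects which element of b goes to the
 * high one. */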
#define _mm_shuffle_pd(a, b, i) \
    ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                    (int)(i)))

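/* The cast intrinsics below only reinterpret the bit pattern between vector
 * types; they generate no instructions. */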
__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
{
    return (__m128d)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
{
    return (__m128d)a;
}

void _mm_pause(void);

#endif /* _MSC_VER */

#ifdef __cplusplus
} // extern "C"
#endif

#endif /* _INCLUDED_EMM */