/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#pragma once
#ifndef _INCLUDED_EMM
#define _INCLUDED_EMM

#include <vcruntime.h>
#include <xmmintrin.h>

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i
{
    __int8 m128i_i8[16];
    __int16 m128i_i16[8];
    __int32 m128i_i32[4];
    __int64 m128i_i64[2];
    unsigned __int8 m128i_u8[16];
    unsigned __int16 m128i_u16[8];
    unsigned __int32 m128i_u32[4];
    unsigned __int64 m128i_u64[2];
} __m128i;
_STATIC_ASSERT(sizeof(__m128i) == 16);

typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d
{
    double m128d_f64[2];
} __m128d;

typedef __declspec(align(1)) __m128i __m128i_u;

#define __ATTRIBUTE_SSE2__

#else /* _MSC_VER */

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
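/* For example, _mm_cmpgt_epi8() below casts through __v16qs so the
 * comparison is signed even on targets where plain char is unsigned. */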

#ifdef __clang__
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
#endif
#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__

#endif /* _MSC_VER */

extern __m128d _mm_add_sd(__m128d a, __m128d b);
extern __m128d _mm_add_pd(__m128d a, __m128d b);
extern __m128d _mm_sub_sd(__m128d a, __m128d b);
extern __m128d _mm_sub_pd(__m128d a, __m128d b);
extern __m128d _mm_mul_sd(__m128d a, __m128d b);
extern __m128d _mm_mul_pd(__m128d a, __m128d b);
extern __m128d _mm_div_sd(__m128d a, __m128d b);
extern __m128d _mm_div_pd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_sd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_pd(__m128d a);
extern __m128d _mm_min_sd(__m128d a, __m128d b);
extern __m128d _mm_min_pd(__m128d a, __m128d b);
extern __m128d _mm_max_sd(__m128d a, __m128d b);
extern __m128d _mm_max_pd(__m128d a, __m128d b);
extern __m128d _mm_and_pd(__m128d a, __m128d b);
extern __m128d _mm_andnot_pd(__m128d a, __m128d b);
extern __m128d _mm_or_pd(__m128d a, __m128d b);
extern __m128d _mm_xor_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmple_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmple_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b);
extern int _mm_comieq_sd(__m128d a, __m128d b);
extern int _mm_comilt_sd(__m128d a, __m128d b);
extern int _mm_comile_sd(__m128d a, __m128d b);
extern int _mm_comigt_sd(__m128d a, __m128d b);
extern int _mm_comige_sd(__m128d a, __m128d b);
extern int _mm_comineq_sd(__m128d a, __m128d b);
extern int _mm_ucomieq_sd(__m128d a, __m128d b);
extern int _mm_ucomilt_sd(__m128d a, __m128d b);
extern int _mm_ucomile_sd(__m128d a, __m128d b);
extern int _mm_ucomigt_sd(__m128d a, __m128d b);
extern int _mm_ucomige_sd(__m128d a, __m128d b);
extern int _mm_ucomineq_sd(__m128d a, __m128d b);
extern __m128 _mm_cvtpd_ps(__m128d a);
extern __m128d _mm_cvtps_pd(__m128 a);
extern __m128d _mm_cvtepi32_pd(__m128i a);
extern __m128i _mm_cvtpd_epi32(__m128d a);
extern int _mm_cvtsd_si32(__m128d a);
extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b);
extern __m128d _mm_cvtsi32_sd(__m128d a, int b);
extern __m128d _mm_cvtss_sd(__m128d a, __m128 b);
extern __m128i _mm_cvttpd_epi32(__m128d a);
extern int _mm_cvttsd_si32(__m128d a);
extern __m64 _mm_cvtpd_pi32(__m128d a);
extern __m64 _mm_cvttpd_pi32(__m128d a);
extern __m128d _mm_cvtpi32_pd(__m64 a);
extern double _mm_cvtsd_f64(__m128d a);
extern __m128d _mm_load_pd(double const *dp);
extern __m128d _mm_load1_pd(double const *dp);
extern __m128d _mm_loadr_pd(double const *dp);
extern __m128d _mm_loadu_pd(double const *dp);
//extern __m128i _mm_loadu_si64(void const *a);
//extern __m128i _mm_loadu_si32(void const *a);
//extern __m128i _mm_loadu_si16(void const *a);
extern __m128d _mm_load_sd(double const *dp);
extern __m128d _mm_loadh_pd(__m128d a, double const *dp);
extern __m128d _mm_loadl_pd(__m128d a, double const *dp);
//extern __m128d _mm_undefined_pd(void);
extern __m128d _mm_set_sd(double w);
extern __m128d _mm_set1_pd(double w);
extern __m128d _mm_set_pd(double w, double x);
extern __m128d _mm_setr_pd(double w, double x);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d a, __m128d b);
extern void _mm_store_sd(double *dp, __m128d a);
extern void _mm_store_pd(double *dp, __m128d a);
extern void _mm_store1_pd(double *dp, __m128d a);
extern void _mm_storeu_pd(double *dp, __m128d a);
extern void _mm_storer_pd(double *dp, __m128d a);
extern void _mm_storeh_pd(double *dp, __m128d a);
extern void _mm_storel_pd(double *dp, __m128d a);
extern __m128i _mm_add_epi8(__m128i a, __m128i b);
extern __m128i _mm_add_epi16(__m128i a, __m128i b);
extern __m128i _mm_add_epi32(__m128i a, __m128i b);
extern __m64 _mm_add_si64(__m64 a, __m64 b);
extern __m128i _mm_add_epi64(__m128i a, __m128i b);
extern __m128i _mm_adds_epi8(__m128i a, __m128i b);
extern __m128i _mm_adds_epi16(__m128i a, __m128i b);
extern __m128i _mm_adds_epu8(__m128i a, __m128i b);
extern __m128i _mm_adds_epu16(__m128i a, __m128i b);
extern __m128i _mm_avg_epu8(__m128i a, __m128i b);
extern __m128i _mm_avg_epu16(__m128i a, __m128i b);
extern __m128i _mm_madd_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epu8(__m128i a, __m128i b);
extern __m128i _mm_min_epi16(__m128i a, __m128i b);
extern __m128i _mm_min_epu8(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b);
extern __m128i _mm_mullo_epi16(__m128i a, __m128i b);
extern __m64 _mm_mul_su32(__m64 a, __m64 b);
extern __m128i _mm_mul_epu32(__m128i a, __m128i b);
extern __m128i _mm_sad_epu8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi16(__m128i a, __m128i b);
extern __m128i _mm_sub_epi32(__m128i a, __m128i b);
extern __m64 _mm_sub_si64(__m64 a, __m64 b);
extern __m128i _mm_sub_epi64(__m128i a, __m128i b);
extern __m128i _mm_subs_epi8(__m128i a, __m128i b);
extern __m128i _mm_subs_epi16(__m128i a, __m128i b);
extern __m128i _mm_subs_epu8(__m128i a, __m128i b);
extern __m128i _mm_subs_epu16(__m128i a, __m128i b);
extern __m128i _mm_and_si128(__m128i a, __m128i b);
extern __m128i _mm_andnot_si128(__m128i a, __m128i b);
extern __m128i _mm_or_si128(__m128i a, __m128i b);
extern __m128i _mm_xor_si128(__m128i a, __m128i b);
extern __m128i _mm_slli_si128(__m128i a, int i);
extern __m128i _mm_slli_epi16(__m128i a, int count);
extern __m128i _mm_sll_epi16(__m128i a, __m128i count);
extern __m128i _mm_slli_epi32(__m128i a, int count);
extern __m128i _mm_sll_epi32(__m128i a, __m128i count);
extern __m128i _mm_slli_epi64(__m128i a, int count);
extern __m128i _mm_sll_epi64(__m128i a, __m128i count);
extern __m128i _mm_srai_epi16(__m128i a, int count);
extern __m128i _mm_sra_epi16(__m128i a, __m128i count);
extern __m128i _mm_srai_epi32(__m128i a, int count);
extern __m128i _mm_sra_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_si128(__m128i a, int imm);
extern __m128i _mm_srli_epi16(__m128i a, int count);
extern __m128i _mm_srl_epi16(__m128i a, __m128i count);
extern __m128i _mm_srli_epi32(__m128i a, int count);
extern __m128i _mm_srl_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_epi64(__m128i a, int count);
extern __m128i _mm_srl_epi64(__m128i a, __m128i count);
extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b);
#ifdef _M_AMD64
extern __m128d _mm_cvtsi64_sd(__m128d a, long long b);
extern long long _mm_cvtsd_si64(__m128d a);
extern long long _mm_cvttsd_si64(__m128d a);
#endif
extern __m128 _mm_cvtepi32_ps(__m128i a);
extern __m128i _mm_cvtps_epi32(__m128 a);
extern __m128i _mm_cvttps_epi32(__m128 a);
extern __m128i _mm_cvtsi32_si128(int a);
#ifdef _M_AMD64
extern __m128i _mm_cvtsi64_si128(long long a);
#endif
extern int _mm_cvtsi128_si32(__m128i a);
#ifdef _M_AMD64
extern long long _mm_cvtsi128_si64(__m128i a);
#endif
extern __m128i _mm_load_si128(__m128i const *p);
extern __m128i _mm_loadu_si128(__m128i_u const *p);
extern __m128i _mm_loadl_epi64(__m128i_u const *p);
//extern __m128i _mm_undefined_si128(void);
//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME
extern __m128i _mm_set_epi64(__m64 q1, __m64 q0);
//extern __m128i _mm_set_epi32(int i3, int i1, int i0);
extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
//extern __m128i _mm_set_epi16(short w7, short w2, short w1, short w0);
extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
//extern __m128i _mm_set_epi8(char b15, char b10, char b4, char b3, char b2, char b1, char b0);
extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
//extern __m128i _mm_set1_epi64x(long long q); // FIXME
extern __m128i _mm_set1_epi64(__m64 q);
extern __m128i _mm_set1_epi32(int i);
extern __m128i _mm_set1_epi16(short w);
extern __m128i _mm_set1_epi8(char b);
extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang?
extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1);
//extern __m128i _mm_setr_epi32(int i0, int i2, int i3);
extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
//extern __m128i _mm_setr_epi16(short w0, short w5, short w6, short w7);
extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
//extern __m128i _mm_setr_epi8(char b0, char b6, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setr_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
extern __m128i _mm_setzero_si128(void);
extern void _mm_store_si128(__m128i *p, __m128i b);
extern void _mm_storeu_si128(__m128i_u *p, __m128i b);
//extern void _mm_storeu_si64(void *p, __m128i b);
//extern void _mm_storeu_si32(void *p, __m128i b);
//extern void _mm_storeu_si16(void *p, __m128i b);
extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p);
extern void _mm_storel_epi64(__m128i_u *p, __m128i a);
extern void _mm_stream_pd(double *p, __m128d a);
extern void _mm_stream_si128(__m128i *p, __m128i a);
extern void _mm_stream_si32(int *p, int a);
extern void _mm_clflush(void const *p);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern __m128i _mm_packs_epi16(__m128i a, __m128i b);
extern __m128i _mm_packs_epi32(__m128i a, __m128i b);
extern __m128i _mm_packus_epi16(__m128i a, __m128i b);
extern int _mm_extract_epi16(__m128i a, int imm);
extern __m128i _mm_insert_epi16(__m128i a, int b, int imm);
extern int _mm_movemask_epi8(__m128i a);
extern __m128i _mm_shuffle_epi32(__m128i a, int imm);
extern __m128i _mm_shufflelo_epi16(__m128i a, int imm);
extern __m128i _mm_shufflehi_epi16(__m128i a, int imm);
extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
extern __m64 _mm_movepi64_pi64(__m128i a);
extern __m128i _mm_movpi64_epi64(__m64 a);
extern __m128i _mm_move_epi64(__m128i a);
extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b);
extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b);
extern int _mm_movemask_pd(__m128d a);
extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm);
extern __m128 _mm_castpd_ps(__m128d a);
extern __m128i _mm_castpd_si128(__m128d a);
extern __m128d _mm_castps_pd(__m128 a);
extern __m128i _mm_castps_si128(__m128 a);
extern __m128 _mm_castsi128_ps(__m128i a);
extern __m128d _mm_castsi128_pd(__m128i a);
extern void _mm_pause(void);

/* Alternate names */
#define _mm_set_pd1(a) _mm_set1_pd(a)
#define _mm_load_pd1(p) _mm_load1_pd(p)
#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a))
#define _mm_bslli_si128 _mm_slli_si128
#define _mm_bsrli_si128 _mm_srli_si128
#define _mm_stream_si64 _mm_stream_si64x
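/* Example (illustrative): _mm_store_pd1(p, a) expands to _mm_store1_pd(p, a),
 * which writes a[0] to both p[0] and p[1]. */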

#if defined(_MSC_VER) && !defined(__clang__)

#pragma intrinsic(_mm_add_sd)
#pragma intrinsic(_mm_add_pd)
#pragma intrinsic(_mm_sub_sd)
#pragma intrinsic(_mm_sub_pd)
#pragma intrinsic(_mm_mul_sd)
#pragma intrinsic(_mm_mul_pd)
#pragma intrinsic(_mm_div_sd)
#pragma intrinsic(_mm_div_pd)
#pragma intrinsic(_mm_sqrt_sd)
#pragma intrinsic(_mm_sqrt_pd)
#pragma intrinsic(_mm_min_sd)
#pragma intrinsic(_mm_min_pd)
#pragma intrinsic(_mm_max_sd)
#pragma intrinsic(_mm_max_pd)
#pragma intrinsic(_mm_and_pd)
#pragma intrinsic(_mm_andnot_pd)
#pragma intrinsic(_mm_or_pd)
#pragma intrinsic(_mm_xor_pd)
#pragma intrinsic(_mm_cmpeq_pd)
#pragma intrinsic(_mm_cmplt_pd)
#pragma intrinsic(_mm_cmple_pd)
#pragma intrinsic(_mm_cmpgt_pd)
#pragma intrinsic(_mm_cmpge_pd)
#pragma intrinsic(_mm_cmpord_pd)
#pragma intrinsic(_mm_cmpunord_pd)
#pragma intrinsic(_mm_cmpneq_pd)
#pragma intrinsic(_mm_cmpnlt_pd)
#pragma intrinsic(_mm_cmpnle_pd)
#pragma intrinsic(_mm_cmpngt_pd)
#pragma intrinsic(_mm_cmpnge_pd)
#pragma intrinsic(_mm_cmpeq_sd)
#pragma intrinsic(_mm_cmplt_sd)
#pragma intrinsic(_mm_cmple_sd)
#pragma intrinsic(_mm_cmpgt_sd)
#pragma intrinsic(_mm_cmpge_sd)
#pragma intrinsic(_mm_cmpord_sd)
#pragma intrinsic(_mm_cmpunord_sd)
#pragma intrinsic(_mm_cmpneq_sd)
#pragma intrinsic(_mm_cmpnlt_sd)
#pragma intrinsic(_mm_cmpnle_sd)
#pragma intrinsic(_mm_cmpngt_sd)
#pragma intrinsic(_mm_cmpnge_sd)
#pragma intrinsic(_mm_comieq_sd)
#pragma intrinsic(_mm_comilt_sd)
#pragma intrinsic(_mm_comile_sd)
#pragma intrinsic(_mm_comigt_sd)
#pragma intrinsic(_mm_comige_sd)
#pragma intrinsic(_mm_comineq_sd)
#pragma intrinsic(_mm_ucomieq_sd)
#pragma intrinsic(_mm_ucomilt_sd)
#pragma intrinsic(_mm_ucomile_sd)
#pragma intrinsic(_mm_ucomigt_sd)
#pragma intrinsic(_mm_ucomige_sd)
#pragma intrinsic(_mm_ucomineq_sd)
#pragma intrinsic(_mm_cvtpd_ps)
#pragma intrinsic(_mm_cvtps_pd)
#pragma intrinsic(_mm_cvtepi32_pd)
#pragma intrinsic(_mm_cvtpd_epi32)
#pragma intrinsic(_mm_cvtsd_si32)
#pragma intrinsic(_mm_cvtsd_ss)
#pragma intrinsic(_mm_cvtsi32_sd)
#pragma intrinsic(_mm_cvtss_sd)
#pragma intrinsic(_mm_cvttpd_epi32)
#pragma intrinsic(_mm_cvttsd_si32)
//#pragma intrinsic(_mm_cvtpd_pi32)
//#pragma intrinsic(_mm_cvttpd_pi32)
//#pragma intrinsic(_mm_cvtpi32_pd)
#pragma intrinsic(_mm_cvtsd_f64)
#pragma intrinsic(_mm_load_pd)
#pragma intrinsic(_mm_load1_pd)
#pragma intrinsic(_mm_loadr_pd)
#pragma intrinsic(_mm_loadu_pd)
//#pragma intrinsic(_mm_loadu_si64)
//#pragma intrinsic(_mm_loadu_si32)
//#pragma intrinsic(_mm_loadu_si16)
#pragma intrinsic(_mm_load_sd)
#pragma intrinsic(_mm_loadh_pd)
#pragma intrinsic(_mm_loadl_pd)
//#pragma intrinsic(_mm_undefined_pd)
#pragma intrinsic(_mm_set_sd)
#pragma intrinsic(_mm_set1_pd)
#pragma intrinsic(_mm_set_pd)
#pragma intrinsic(_mm_setr_pd)
#pragma intrinsic(_mm_setzero_pd)
#pragma intrinsic(_mm_move_sd)
#pragma intrinsic(_mm_store_sd)
#pragma intrinsic(_mm_store_pd)
#pragma intrinsic(_mm_store1_pd)
#pragma intrinsic(_mm_storeu_pd)
#pragma intrinsic(_mm_storer_pd)
#pragma intrinsic(_mm_storeh_pd)
#pragma intrinsic(_mm_storel_pd)
#pragma intrinsic(_mm_add_epi8)
#pragma intrinsic(_mm_add_epi16)
#pragma intrinsic(_mm_add_epi32)
//#pragma intrinsic(_mm_add_si64)
#pragma intrinsic(_mm_add_epi64)
#pragma intrinsic(_mm_adds_epi8)
#pragma intrinsic(_mm_adds_epi16)
#pragma intrinsic(_mm_adds_epu8)
#pragma intrinsic(_mm_adds_epu16)
#pragma intrinsic(_mm_avg_epu8)
#pragma intrinsic(_mm_avg_epu16)
#pragma intrinsic(_mm_madd_epi16)
#pragma intrinsic(_mm_max_epi16)
#pragma intrinsic(_mm_max_epu8)
#pragma intrinsic(_mm_min_epi16)
#pragma intrinsic(_mm_min_epu8)
#pragma intrinsic(_mm_mulhi_epi16)
#pragma intrinsic(_mm_mulhi_epu16)
#pragma intrinsic(_mm_mullo_epi16)
//#pragma intrinsic(_mm_mul_su32)
#pragma intrinsic(_mm_mul_epu32)
#pragma intrinsic(_mm_sad_epu8)
#pragma intrinsic(_mm_sub_epi8)
#pragma intrinsic(_mm_sub_epi16)
#pragma intrinsic(_mm_sub_epi32)
//#pragma intrinsic(_mm_sub_si64)
#pragma intrinsic(_mm_sub_epi64)
#pragma intrinsic(_mm_subs_epi8)
#pragma intrinsic(_mm_subs_epi16)
#pragma intrinsic(_mm_subs_epu8)
#pragma intrinsic(_mm_subs_epu16)
#pragma intrinsic(_mm_and_si128)
#pragma intrinsic(_mm_andnot_si128)
#pragma intrinsic(_mm_or_si128)
#pragma intrinsic(_mm_xor_si128)
#pragma intrinsic(_mm_slli_si128)
#pragma intrinsic(_mm_slli_epi16)
#pragma intrinsic(_mm_sll_epi16)
#pragma intrinsic(_mm_slli_epi32)
#pragma intrinsic(_mm_sll_epi32)
#pragma intrinsic(_mm_slli_epi64)
#pragma intrinsic(_mm_sll_epi64)
#pragma intrinsic(_mm_srai_epi16)
#pragma intrinsic(_mm_sra_epi16)
#pragma intrinsic(_mm_srai_epi32)
#pragma intrinsic(_mm_sra_epi32)
#pragma intrinsic(_mm_srli_si128)
#pragma intrinsic(_mm_srli_epi16)
#pragma intrinsic(_mm_srl_epi16)
#pragma intrinsic(_mm_srli_epi32)
#pragma intrinsic(_mm_srl_epi32)
#pragma intrinsic(_mm_srli_epi64)
#pragma intrinsic(_mm_srl_epi64)
#pragma intrinsic(_mm_cmpeq_epi8)
#pragma intrinsic(_mm_cmpeq_epi16)
#pragma intrinsic(_mm_cmpeq_epi32)
#pragma intrinsic(_mm_cmpgt_epi8)
#pragma intrinsic(_mm_cmpgt_epi16)
#pragma intrinsic(_mm_cmpgt_epi32)
#pragma intrinsic(_mm_cmplt_epi8)
#pragma intrinsic(_mm_cmplt_epi16)
#pragma intrinsic(_mm_cmplt_epi32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_sd)
#pragma intrinsic(_mm_cvtsd_si64)
#pragma intrinsic(_mm_cvttsd_si64)
#endif
#pragma intrinsic(_mm_cvtepi32_ps)
#pragma intrinsic(_mm_cvtps_epi32)
#pragma intrinsic(_mm_cvttps_epi32)
#pragma intrinsic(_mm_cvtsi32_si128)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_si128)
#endif
#pragma intrinsic(_mm_cvtsi128_si32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi128_si64)
#endif
#pragma intrinsic(_mm_load_si128)
#pragma intrinsic(_mm_loadu_si128)
#pragma intrinsic(_mm_loadl_epi64)
//#pragma intrinsic(_mm_undefined_si128)
#pragma intrinsic(_mm_set_epi64x)
//#pragma intrinsic(_mm_set_epi64)
#pragma intrinsic(_mm_set_epi32)
#pragma intrinsic(_mm_set_epi16)
#pragma intrinsic(_mm_set_epi8)
#pragma intrinsic(_mm_set1_epi64x)
//#pragma intrinsic(_mm_set1_epi64)
#pragma intrinsic(_mm_set1_epi32)
#pragma intrinsic(_mm_set1_epi16)
#pragma intrinsic(_mm_set1_epi8)
#pragma intrinsic(_mm_setl_epi64)
//#pragma intrinsic(_mm_setr_epi64)
#pragma intrinsic(_mm_setr_epi32)
#pragma intrinsic(_mm_setr_epi16)
#pragma intrinsic(_mm_setr_epi8)
#pragma intrinsic(_mm_setzero_si128)
#pragma intrinsic(_mm_store_si128)
#pragma intrinsic(_mm_storeu_si128)
//#pragma intrinsic(_mm_storeu_si64)
//#pragma intrinsic(_mm_storeu_si32)
//#pragma intrinsic(_mm_storeu_si16)
#pragma intrinsic(_mm_maskmoveu_si128)
#pragma intrinsic(_mm_storel_epi64)
#pragma intrinsic(_mm_stream_pd)
#pragma intrinsic(_mm_stream_si128)
#pragma intrinsic(_mm_stream_si32)
#pragma intrinsic(_mm_clflush)
#pragma intrinsic(_mm_lfence)
#pragma intrinsic(_mm_mfence)
#pragma intrinsic(_mm_packs_epi16)
#pragma intrinsic(_mm_packs_epi32)
#pragma intrinsic(_mm_packus_epi16)
#pragma intrinsic(_mm_extract_epi16)
#pragma intrinsic(_mm_insert_epi16)
#pragma intrinsic(_mm_movemask_epi8)
#pragma intrinsic(_mm_shuffle_epi32)
#pragma intrinsic(_mm_shufflelo_epi16)
#pragma intrinsic(_mm_shufflehi_epi16)
#pragma intrinsic(_mm_unpackhi_epi8)
#pragma intrinsic(_mm_unpackhi_epi16)
#pragma intrinsic(_mm_unpackhi_epi32)
#pragma intrinsic(_mm_unpackhi_epi64)
#pragma intrinsic(_mm_unpacklo_epi8)
#pragma intrinsic(_mm_unpacklo_epi16)
#pragma intrinsic(_mm_unpacklo_epi32)
#pragma intrinsic(_mm_unpacklo_epi64)
//#pragma intrinsic(_mm_movepi64_pi64)
//#pragma intrinsic(_mm_movpi64_epi64)
#pragma intrinsic(_mm_move_epi64)
#pragma intrinsic(_mm_unpackhi_pd)
#pragma intrinsic(_mm_unpacklo_pd)
#pragma intrinsic(_mm_movemask_pd)
#pragma intrinsic(_mm_shuffle_pd)
#pragma intrinsic(_mm_castpd_ps)
#pragma intrinsic(_mm_castpd_si128)
#pragma intrinsic(_mm_castps_pd)
#pragma intrinsic(_mm_castps_si128)
#pragma intrinsic(_mm_castsi128_ps)
#pragma intrinsic(_mm_castsi128_pd)
#pragma intrinsic(_mm_pause)

#else /* _MSC_VER */

/*
Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h
Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h
unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h
*/

__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b)
{
    a[0] += b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a + (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    a[0] -= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a - (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b)
{
    a[0] *= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a * (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b)
{
    a[0] /= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a / (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_sqrtsd((__v2df)b);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a)
{
    return __builtin_ia32_sqrtpd((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_minsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_minpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    return (__m128d)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a ^ (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
{
    return __builtin_ia32_cvtpd2ps((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtps2pd(a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtdq2pd((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
{
    return __builtin_ia32_cvtsd2si((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
    return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a, int b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
{
    return __builtin_ia32_cvttsd2si((__v2df)a);
}

__INTRIN_INLINE_MMX __m64 _mm_cvtpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMX __m64 _mm_cvttpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
}

__INTRIN_INLINE_SSE __m128d _mm_cvtpi32_pd(__m64 a)
{
    return __builtin_ia32_cvtpi2pd((__v2si)a);
}

__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
{
    return *(const __m128d *)dp;
}

__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
{
    struct __mm_load1_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, __u};
}

// GCC:
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
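/* Example (illustrative): with the selector above,
 *   _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0))
 * picks a[0] for the low element and b[1] for the high element. */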

__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    __m128d u = *(const __m128d *)dp;
    return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
#else
    return (__m128d){ dp[1], dp[0] };
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
{
    struct __loadu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_pd *)dp)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
{
    struct __loadu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    long long __u = ((const struct __loadu_si64 *)a)->__v;
    return __extension__(__m128i)(__v2di){__u, 0LL};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
{
    struct __loadu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    int __u = ((const struct __loadu_si32 *)a)->__v;
    return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
{
    struct __loadu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    short __u = ((const struct __loadu_si16 *)a)->__v;
    return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
{
    struct __mm_load_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
    return __extension__(__m128d){__u, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
{
    struct __mm_loadh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
    return __extension__(__m128d){a[0], __u};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
{
    struct __mm_loadl_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128d)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: yields an indeterminate value
     * without referencing any other storage. */
    __m128d undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
{
    return __extension__(__m128d){w, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
{
    return __extension__(__m128d){w, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
{
    return __extension__(__m128d){x, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
{
    return __extension__(__m128d){w, x};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
{
    return __extension__(__m128d){0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
{
    struct __mm_store_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_store_sd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
{
    *(__m128d *)dp = a;
}

__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
    _mm_store_pd(dp, a);
#else
    dp[0] = a[0];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
{
    struct __storeu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_pd *)dp)->__v = a;
}

__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
    *(__m128d *)dp = a;
#else
    dp[0] = a[1];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
}

__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a + (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a + (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a + (__v4su)b);
}

__INTRIN_INLINE_MMX __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a + (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a * (__v8hu)b);
}

__INTRIN_INLINE_MMX __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a - (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a - (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a - (__v4su)b);
}

__INTRIN_INLINE_MMX __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a - (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return (__m128i)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a ^ (__v2du)b);
}

#define _mm_slli_si128(a, imm) \
    ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
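/* Example (illustrative): _mm_slli_si128(a, 4) shifts the whole 128-bit value
 * left by 4 bytes, shifting in zeros. The count is in bytes and must be a
 * compile-time constant, which is why this is a macro rather than a function. */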
1343
_mm_slli_epi16(__m128i a,int count)1344 __INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
1345 {
1346 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
1347 }
1348
_mm_sll_epi16(__m128i a,__m128i count)1349 __INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
1350 {
1351 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
1352 }
1353
_mm_slli_epi32(__m128i a,int count)1354 __INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
1355 {
1356 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
1357 }
1358
_mm_sll_epi32(__m128i a,__m128i count)1359 __INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
1360 {
1361 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
1362 }
1363
_mm_slli_epi64(__m128i a,int count)1364 __INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
1365 {
1366 return __builtin_ia32_psllqi128((__v2di)a, count);
1367 }
1368
_mm_sll_epi64(__m128i a,__m128i count)1369 __INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
1370 {
1371 return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
1372 }
1373
_mm_srai_epi16(__m128i a,int count)1374 __INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
1375 {
1376 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
1377 }
1378
_mm_sra_epi16(__m128i a,__m128i count)1379 __INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
1380 {
1381 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
1382 }
1383
_mm_srai_epi32(__m128i a,int count)1384 __INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
1385 {
1386 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
1387 }
1388
_mm_sra_epi32(__m128i a,__m128i count)1389 __INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
1390 {
1391 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
1392 }
1393
1394 #define _mm_srli_si128(a, imm) \
1395 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
1396
_mm_srli_epi16(__m128i a,int count)1397 __INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
1398 {
1399 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
1400 }
1401
_mm_srl_epi16(__m128i a,__m128i count)1402 __INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
1403 {
1404 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
1405 }
1406
_mm_srli_epi32(__m128i a,int count)1407 __INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
1408 {
1409 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
1410 }
1411
_mm_srl_epi32(__m128i a,__m128i count)1412 __INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
1413 {
1414 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
1415 }
1416
_mm_srli_epi64(__m128i a,int count)1417 __INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
1418 {
1419 return __builtin_ia32_psrlqi128((__v2di)a, count);
1420 }
1421
_mm_srl_epi64(__m128i a,__m128i count)1422 __INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
1423 {
1424 return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
1425 }
1426
_mm_cmpeq_epi8(__m128i a,__m128i b)1427 __INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
1428 {
1429 return (__m128i)((__v16qi)a == (__v16qi)b);
1430 }
1431
_mm_cmpeq_epi16(__m128i a,__m128i b)1432 __INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
1433 {
1434 return (__m128i)((__v8hi)a == (__v8hi)b);
1435 }
1436
_mm_cmpeq_epi32(__m128i a,__m128i b)1437 __INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
1438 {
1439 return (__m128i)((__v4si)a == (__v4si)b);
1440 }
1441
_mm_cmpgt_epi8(__m128i a,__m128i b)1442 __INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
1443 {
1444 /* This function always performs a signed comparison, but __v16qi is a char
1445 which may be signed or unsigned, so use __v16qs. */
1446 return (__m128i)((__v16qs)a > (__v16qs)b);
1447 }
1448
_mm_cmpgt_epi16(__m128i a,__m128i b)1449 __INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
1450 {
1451 return (__m128i)((__v8hi)a > (__v8hi)b);
1452 }
1453
_mm_cmpgt_epi32(__m128i a,__m128i b)1454 __INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
1455 {
1456 return (__m128i)((__v4si)a > (__v4si)b);
1457 }
1458
_mm_cmplt_epi8(__m128i a,__m128i b)1459 __INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
1460 {
1461 return _mm_cmpgt_epi8(b, a);
1462 }
1463
_mm_cmplt_epi16(__m128i a,__m128i b)1464 __INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
1465 {
1466 return _mm_cmpgt_epi16(b, a);
1467 }
1468
_mm_cmplt_epi32(__m128i a,__m128i b)1469 __INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
1470 {
1471 return _mm_cmpgt_epi32(b, a);
1472 }

#ifdef _M_AMD64

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
{
    return __builtin_ia32_cvtsd2si64((__v2df)a);
}

__INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
{
    return __builtin_ia32_cvttsd2si64((__v2df)a);
}
#endif

__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
#else
    return __builtin_ia32_cvtdq2ps((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
}
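
/*
 * Rounding sketch (illustrative): _mm_cvtps_epi32 rounds according to the
 * current MXCSR rounding mode (round-to-nearest-even by default), while
 * _mm_cvttps_epi32 always truncates toward zero:
 *
 *   __m128 f = _mm_set1_ps(2.7f);
 *   _mm_cvtps_epi32(f);  // 3 in every lane under the default mode
 *   _mm_cvttps_epi32(f); // 2 in every lane
 */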

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
{
    return __extension__(__m128i)(__v4si){a, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
{
    return __extension__(__m128i)(__v2di){a, 0};
}

__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
{
    __v4si b = (__v4si)a;
    return b[0];
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
{
    return *p;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
{
    struct __loadu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_si128 *)p)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
{
    struct __mm_loadl_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    return __extension__(__m128i){
        ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
}
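
/*
 * Alignment sketch (illustrative): _mm_load_si128 requires a 16-byte
 * aligned pointer and may fault otherwise; _mm_loadu_si128 accepts any
 * alignment, and _mm_loadl_epi64 reads only the low 8 bytes:
 *
 *   unsigned char buf[17];
 *   __m128i v = _mm_loadu_si128((const __m128i_u *)(buf + 1)); // OK
 */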

__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128i)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: the value is meant to be
       indeterminate, and this idiom suppresses uninitialized-use
       warnings on compilers without __builtin_ia32_undef128. */
    __m128i undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
{
    return __extension__(__m128i)(__v2di){q0, q1};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
{
    return _mm_set_epi64x((long long)q1, (long long)q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
    short w7, short w6, short w5, short w4,
    short w3, short w2, short w1, short w0)
{
    return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
    char b15, char b14, char b13, char b12,
    char b11, char b10, char b9, char b8,
    char b7, char b6, char b5, char b4,
    char b3, char b2, char b1, char b0)
{
    return __extension__(__m128i)(__v16qi){
        b0, b1, b2, b3, b4, b5, b6, b7,
        b8, b9, b10, b11, b12, b13, b14, b15};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
{
    return _mm_set_epi64x(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
{
    return _mm_set_epi64(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
{
    return _mm_set_epi16(w, w, w, w, w, w, w, w);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
{
    return _mm_set_epi8(b, b, b, b, b, b, b, b,
                        b, b, b, b, b, b, b, b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
{
    return _mm_set_epi64(q1, q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
    return _mm_set_epi32(i3, i2, i1, i0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
    short w0, short w1, short w2, short w3,
    short w4, short w5, short w6, short w7)
{
    return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
    char b0, char b1, char b2, char b3,
    char b4, char b5, char b6, char b7,
    char b8, char b9, char b10, char b11,
    char b12, char b13, char b14, char b15)
{
    return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
                        b7, b6, b5, b4, b3, b2, b1, b0);
}
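
/*
 * Ordering sketch (illustrative): the _mm_set_* constructors take their
 * arguments from the highest lane down to the lowest, while the _mm_setr_*
 * variants take them in memory (low-to-high) order, so the following two
 * calls build the same vector:
 *
 *   _mm_set_epi32(3, 2, 1, 0);
 *   _mm_setr_epi32(0, 1, 2, 3);
 */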

__INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
{
    return __extension__(__m128i)(__v2di){0LL, 0LL};
}

__INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
{
    *p = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
{
    struct __storeu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si128 *)p)->__v = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
{
    struct __storeu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
{
    struct __storeu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
{
    struct __storeu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
}
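
/*
 * Usage sketch (illustrative): the _mm_storeu_si64/si32/si16 helpers write
 * only the low 8, 4, or 2 bytes of the vector to an address of any
 * alignment:
 *
 *   unsigned char out[4];
 *   _mm_storeu_si32(out, _mm_set1_epi32(0x11223344)); // writes 4 bytes
 */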

__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
    __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di *)p);
#else
    __builtin_ia32_movntdq(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif
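
/*
 * Ordering sketch (illustrative): the _mm_stream_* stores are non-temporal;
 * they bypass the cache hierarchy and are weakly ordered, so data published
 * to another thread this way should be followed by a store fence:
 *
 *   _mm_stream_si32(p, value);
 *   _mm_sfence(); // declared in xmmintrin.h
 */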

void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}
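
/*
 * Saturation sketch (illustrative): the pack intrinsics narrow with
 * saturation rather than truncation, clamping out-of-range values to the
 * limits of the destination type:
 *
 *   __m128i w = _mm_set1_epi16(300);
 *   _mm_packs_epi16(w, w);  // every byte becomes 127 (signed clamp)
 *   _mm_packus_epi16(w, w); // every byte becomes 255 (unsigned clamp)
 */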

#define _mm_extract_epi16(a, imm) \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm) \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                          (int)(imm)))

__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
{
    return __builtin_ia32_pmovmskb128((__v16qi)a);
}

#define _mm_shuffle_epi32(a, imm) \
    ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

#define _mm_shufflelo_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

#define _mm_shufflehi_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
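
/*
 * Immediate sketch (illustrative): the shuffle immediates select source
 * lanes two bits at a time; the _MM_SHUFFLE macro from xmmintrin.h builds
 * them, so reversing the four 32-bit lanes of a vector looks like:
 *
 *   __m128i v = _mm_setr_epi32(0, 1, 2, 3);
 *   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes 3,2,1,0
 */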

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
        16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
#else
    return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
                                            8 + 5, 6, 8 + 6, 7, 8 + 7);
#else
    return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
                                            4 + 3);
#else
    return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
#else
    return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
        16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
#else
    return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
                                            8 + 1, 2, 8 + 2, 3, 8 + 3);
#else
    return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
                                            4 + 1);
#else
    return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
#else
    return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
#endif
}
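
/*
 * Interleave sketch (illustrative): the unpack intrinsics interleave the
 * low or high halves of their sources; pairing with a zero vector is the
 * classic SSE2 way to widen unsigned elements:
 *
 *   __m128i zero = _mm_setzero_si128();
 *   __m128i lo16 = _mm_unpacklo_epi8(v, zero); // zero-extend low 8 bytes
 */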

__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
{
    return (__m64)a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
{
    return __extension__(__m128i)(__v2di){(long long)a, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
#else
    return (__m128i)__builtin_ia32_movq128((__v2di)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
#else
    return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
#else
    return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
{
    return __builtin_ia32_movmskpd((__v2df)a);
}

#define _mm_shuffle_pd(a, b, i) \
    ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                    (int)(i)))

__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
{
    return (__m128d)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
{
    return (__m128d)a;
}
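
/*
 * Cast sketch (illustrative): the _mm_cast* functions only reinterpret the
 * 128-bit pattern between vector types; they emit no instructions and do
 * not convert values:
 *
 *   __m128i bits = _mm_castps_si128(f); // same bits, viewed as integers
 */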

void _mm_pause(void);

#endif /* _MSC_VER */

#endif /* _INCLUDED_EMM */