1 /*
2  * xmmintrin.h
3  *
4  * This file is part of the ReactOS CRT package.
5  *
6  * Contributors:
7  *   Timo Kreuzer (timo.kreuzer@reactos.org)
8  *
9  * THIS SOFTWARE IS NOT COPYRIGHTED
10  *
11  * This source code is offered for use in the public domain. You may
12  * use, modify or distribute it freely.
13  *
14  * This code is distributed in the hope that it will be useful but
15  * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
16  * DISCLAIMED. This includes but is not limited to warranties of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18  *
19  */
20 
21 #pragma once
22 #ifndef _INCLUDED_MM2
23 #define _INCLUDED_MM2
24 
25 #include <mmintrin.h>
26 
27 #if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
28 #define _MM_FUNCTIONALITY
29 #endif
30 
31 #if !defined _VCRT_BUILD && !defined _INC_MALLOC
32 #include <malloc.h> // For _mm_malloc() and _mm_free()
33 #endif
34 
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38 
39 #if defined(_MSC_VER) && !defined(__clang__)
40 
41 typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128
42 {
43     float m128_f32[4];
44     unsigned __int64 m128_u64[2];
45     __int8 m128_i8[16];
46     __int16 m128_i16[8];
47     __int32 m128_i32[4];
48     __int64 m128_i64[2];
49     unsigned __int8 m128_u8[16];
50     unsigned __int16 m128_u16[8];
51     unsigned __int32 m128_u32[4];
52 } __m128;
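/* Usage sketch (illustrative): with the union definition above, individual
 * lanes can be inspected through the named members, e.g.
 *
 *     __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *     float lo = v.m128_f32[0];   // 1.0f: element 0 comes from the last argument
 *
 * The GCC/Clang vector type defined below exposes no such named members. */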
53 
54 #define __ATTRIBUTE_SSE__
55 
56 #else /* _MSC_VER */
57 
58     typedef        float __v4sf __attribute__((__vector_size__(16)));
59     typedef   signed int __v4si __attribute__((__vector_size__(16)));
60     typedef unsigned int __v4su __attribute__((__vector_size__(16)));
61     typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
62 
63     typedef        float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
64 
65 #ifdef __clang__
66 #define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
67 #else
68 #define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
69 #endif
70 #define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__
71 
72 #endif /* _MSC_VER */
73 
74 #define _MM_ALIGN16 _VCRT_ALIGN(16)
75 
76 /* Constants for use with _mm_prefetch.  */
77 #define _MM_HINT_NTA  0
78 #define _MM_HINT_T0   1
79 #define _MM_HINT_T1   2
80 #define _MM_HINT_T2   3
81 #define _MM_HINT_ENTA 4
82 #if 0 // Not supported yet
83 #define _MM_HINT_ET0  5
84 #define _MM_HINT_ET1  6
85 #define _MM_HINT_ET2  7
86 #endif
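/* Usage sketch (illustrative): a typical call prefetches one cache line into
 * every cache level; the address (here an assumed pointer p) is passed as
 * char const* to match the prototype declared below:
 *
 *     _mm_prefetch((char const*)p, _MM_HINT_T0);
 */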
87 
88 /* Create a selector for use with the SHUFPS instruction.  */
89 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
90     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
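/* Example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) evaluates to 0xE4. Used as
 * _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)) it yields { a[0], a[1], b[2], b[3] }:
 * the two low index fields select from the first operand, the two high index
 * fields select from the second. */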
91 
92 /* Bits in the MXCSR.  */
93 #define _MM_EXCEPT_MASK       0x003f
94 #define _MM_EXCEPT_INVALID    0x0001
95 #define _MM_EXCEPT_DENORM     0x0002
96 #define _MM_EXCEPT_DIV_ZERO   0x0004
97 #define _MM_EXCEPT_OVERFLOW   0x0008
98 #define _MM_EXCEPT_UNDERFLOW  0x0010
99 #define _MM_EXCEPT_INEXACT    0x0020
100 
101 #define _MM_MASK_MASK         0x1f80
102 #define _MM_MASK_INVALID      0x0080
103 #define _MM_MASK_DENORM       0x0100
104 #define _MM_MASK_DIV_ZERO     0x0200
105 #define _MM_MASK_OVERFLOW     0x0400
106 #define _MM_MASK_UNDERFLOW    0x0800
107 #define _MM_MASK_INEXACT      0x1000
108 
109 #define _MM_ROUND_MASK        0x6000
110 #define _MM_ROUND_NEAREST     0x0000
111 #define _MM_ROUND_DOWN        0x2000
112 #define _MM_ROUND_UP          0x4000
113 #define _MM_ROUND_TOWARD_ZERO 0x6000
114 
115 #define _MM_FLUSH_ZERO_MASK   0x8000
116 #define _MM_FLUSH_ZERO_ON     0x8000
117 #define _MM_FLUSH_ZERO_OFF    0x0000
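/* Usage sketch (illustrative): a sticky divide-by-zero flag can be tested and
 * all exception flags cleared with
 *
 *     if (_mm_getcsr() & _MM_EXCEPT_DIV_ZERO)
 *         _mm_setcsr(_mm_getcsr() & ~_MM_EXCEPT_MASK);
 *
 * Equivalent helper macros are defined further below. */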
118 
119 #ifdef __ICL
120 void* __cdecl _mm_malloc(size_t Size, size_t Al);
121 void __cdecl _mm_free(void* P);
122 #endif
123 
124 void _mm_prefetch(_In_ char const* p, _In_ int i);
125 __m128 _mm_setzero_ps(void);
126 __m128 _mm_add_ss(__m128 a, __m128 b);
127 __m128 _mm_sub_ss(__m128 a, __m128 b);
128 __m128 _mm_mul_ss(__m128 a, __m128 b);
129 __m128 _mm_div_ss(__m128 a, __m128 b);
130 __m128 _mm_sqrt_ss(__m128 a);
131 __m128 _mm_rcp_ss(__m128 a);
132 __m128 _mm_rsqrt_ss(__m128 a);
133 __m128 _mm_min_ss(__m128 a, __m128 b);
134 __m128 _mm_max_ss(__m128 a, __m128 b);
135 __m128 _mm_add_ps(__m128 a, __m128 b);
136 __m128 _mm_sub_ps(__m128 a, __m128 b);
137 __m128 _mm_mul_ps(__m128 a, __m128 b);
138 __m128 _mm_div_ps(__m128 a, __m128 b);
139 __m128 _mm_sqrt_ps(__m128 a);
140 __m128 _mm_rcp_ps(__m128 a);
141 __m128 _mm_rsqrt_ps(__m128 a);
142 __m128 _mm_min_ps(__m128 a, __m128 b);
143 __m128 _mm_max_ps(__m128 a, __m128 b);
144 __m128 _mm_and_ps(__m128 a, __m128 b);
145 __m128 _mm_andnot_ps(__m128 a, __m128 b);
146 __m128 _mm_or_ps(__m128 a, __m128 b);
147 __m128 _mm_xor_ps(__m128 a, __m128 b);
148 __m128 _mm_cmpeq_ss(__m128 a, __m128 b);
149 __m128 _mm_cmplt_ss(__m128 a, __m128 b);
150 __m128 _mm_cmple_ss(__m128 a, __m128 b);
151 __m128 _mm_cmpgt_ss(__m128 a, __m128 b);
152 __m128 _mm_cmpge_ss(__m128 a, __m128 b);
153 __m128 _mm_cmpneq_ss(__m128 a, __m128 b);
154 __m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
155 __m128 _mm_cmpnle_ss(__m128 a, __m128 b);
156 __m128 _mm_cmpngt_ss(__m128 a, __m128 b);
157 __m128 _mm_cmpnge_ss(__m128 a, __m128 b);
158 __m128 _mm_cmpord_ss(__m128 a, __m128 b);
159 __m128 _mm_cmpunord_ss(__m128 a, __m128 b);
160 __m128 _mm_cmpeq_ps(__m128 a, __m128 b);
161 __m128 _mm_cmplt_ps(__m128 a, __m128 b);
162 __m128 _mm_cmple_ps(__m128 a, __m128 b);
163 __m128 _mm_cmpgt_ps(__m128 a, __m128 b);
164 __m128 _mm_cmpge_ps(__m128 a, __m128 b);
165 __m128 _mm_cmpneq_ps(__m128 a, __m128 b);
166 __m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
167 __m128 _mm_cmpnle_ps(__m128 a, __m128 b);
168 __m128 _mm_cmpngt_ps(__m128 a, __m128 b);
169 __m128 _mm_cmpnge_ps(__m128 a, __m128 b);
170 __m128 _mm_cmpord_ps(__m128 a, __m128 b);
171 __m128 _mm_cmpunord_ps(__m128 a, __m128 b);
172 int _mm_comieq_ss(__m128 a, __m128 b);
173 int _mm_comilt_ss(__m128 a, __m128 b);
174 int _mm_comile_ss(__m128 a, __m128 b);
175 int _mm_comigt_ss(__m128 a, __m128 b);
176 int _mm_comige_ss(__m128 a, __m128 b);
177 int _mm_comineq_ss(__m128 a, __m128 b);
178 int _mm_ucomieq_ss(__m128 a, __m128 b);
179 int _mm_ucomilt_ss(__m128 a, __m128 b);
180 int _mm_ucomile_ss(__m128 a, __m128 b);
181 int _mm_ucomigt_ss(__m128 a, __m128 b);
182 int _mm_ucomige_ss(__m128 a, __m128 b);
183 int _mm_ucomineq_ss(__m128 a, __m128 b);
184 int _mm_cvt_ss2si(__m128 a);
185 int _mm_cvtt_ss2si(__m128 a);
186 __m128 _mm_cvt_si2ss(__m128 a, int b);
187 #ifdef _M_IX86
188 __m64 _mm_cvt_ps2pi(__m128 a);
189 __m64 _mm_cvtt_ps2pi(__m128 a);
190 __m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
191 #endif
192 __m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
193 __m128 _mm_unpackhi_ps(__m128 a, __m128 b);
194 __m128 _mm_unpacklo_ps(__m128 a, __m128 b);
195 __m128 _mm_loadh_pi(__m128 a, __m64 const* p);
196 void _mm_storeh_pi(__m64* p, __m128 a);
197 __m128 _mm_movehl_ps(__m128 a, __m128 b);
198 __m128 _mm_movelh_ps(__m128 a, __m128 b);
199 __m128 _mm_loadl_pi(__m128 a, __m64 const* p);
200 void _mm_storel_pi(__m64* p, __m128 a);
201 int _mm_movemask_ps(__m128 a);
202 unsigned int _mm_getcsr(void);
203 void _mm_setcsr(unsigned int a);
204 __m128 _mm_set_ss(float a);
205 __m128 _mm_set_ps1(float a);
206 __m128 _mm_load_ss(float const* p);
207 __m128 _mm_load_ps1(float const* p);
208 __m128 _mm_load_ps(float const* p);
209 __m128 _mm_loadu_ps(float const* p);
210 __m128 _mm_loadr_ps(float const* p);
211 __m128 _mm_set_ps(float e3, float e2, float e1, float e0);
212 __m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
213 void _mm_store_ss(float* p, __m128 a);
214 float _mm_cvtss_f32(__m128 a);
215 void _mm_store_ps(float* p, __m128 a);
216 void _mm_storeu_ps(float* p, __m128 a);
217 void _mm_store_ps1(float* p, __m128 a);
218 void _mm_storer_ps(float* p, __m128 a);
219 __m128 _mm_move_ss(__m128 a, __m128 b);
220 #ifdef _M_IX86
221 int _m_pextrw(__m64 a, int imm8);
222 __m64 _m_pinsrw(__m64 a, int i, int imm8);
223 __m64 _m_pmaxsw(__m64 a, __m64 b);
224 __m64 _m_pmaxub(__m64 a, __m64 b);
225 __m64 _m_pminsw(__m64 a, __m64 b);
226 __m64 _m_pminub(__m64 a, __m64 b);
227 int _m_pmovmskb(__m64 a);
228 __m64 _m_pmulhuw(__m64 a, __m64 b);
229 __m64 _m_pshufw(__m64 a, int imm8);
230 void _m_maskmovq(__m64 a, __m64 b, char*);
231 __m64 _m_pavgb(__m64 a, __m64 b);
232 __m64 _m_pavgw(__m64 a, __m64 b);
233 __m64 _m_psadbw(__m64 a, __m64 b);
234 void _mm_stream_pi(__m64* p, __m64 a);
235 #endif
236 void _mm_stream_ps(float* p, __m128 a);
237 void _mm_sfence(void);
238 #ifdef _M_AMD64
239 __int64 _mm_cvtss_si64(__m128 a);
240 __int64 _mm_cvttss_si64(__m128 a);
241 __m128  _mm_cvtsi64_ss(__m128 a, __int64 b);
242 #endif
243 
244 /* Alternate names */
245 #define _mm_cvtss_si32 _mm_cvt_ss2si
246 #define _mm_cvttss_si32 _mm_cvtt_ss2si
247 #define _mm_cvtsi32_ss _mm_cvt_si2ss
248 #define _mm_set1_ps _mm_set_ps1
249 #define _mm_load1_ps _mm_load_ps1
250 #define _mm_store1_ps _mm_store_ps1
251 #define _mm_cvtps_pi32    _mm_cvt_ps2pi
252 #define _mm_cvttps_pi32   _mm_cvtt_ps2pi
253 #define _mm_cvtpi32_ps    _mm_cvt_pi2ps
254 #define _mm_extract_pi16  _m_pextrw
255 #define _mm_insert_pi16   _m_pinsrw
256 #define _mm_max_pi16      _m_pmaxsw
257 #define _mm_max_pu8       _m_pmaxub
258 #define _mm_min_pi16      _m_pminsw
259 #define _mm_min_pu8       _m_pminub
260 #define _mm_movemask_pi8  _m_pmovmskb
261 #define _mm_mulhi_pu16    _m_pmulhuw
262 #define _mm_shuffle_pi16  _m_pshufw
263 #define _mm_maskmove_si64 _m_maskmovq
264 #define _mm_avg_pu8       _m_pavgb
265 #define _mm_avg_pu16      _m_pavgw
266 #define _mm_sad_pu8       _m_psadbw
267 
268 #ifdef _M_IX86
269 /* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */
270 
271 __ATTRIBUTE_SSE__
272 static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
273 {
274     __m64 __b, __c;
275     __m128 __r;
276 
277     __b = _mm_setzero_si64();
278     __b = _mm_cmpgt_pi16(__b, __a);
279     __c = _mm_unpackhi_pi16(__a, __b);
280     __r = _mm_setzero_ps();
281     __r = _mm_cvtpi32_ps(__r, __c);
282     __r = _mm_movelh_ps(__r, __r);
283     __c = _mm_unpacklo_pi16(__a, __b);
284     __r = _mm_cvtpi32_ps(__r, __c);
285 
286     return __r;
287 }
288 
289 __ATTRIBUTE_SSE__
290 static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
291 {
292     __m64 __b, __c;
293     __m128 __r;
294 
295     __b = _mm_setzero_si64();
296     __c = _mm_unpackhi_pi16(__a, __b);
297     __r = _mm_setzero_ps();
298     __r = _mm_cvtpi32_ps(__r, __c);
299     __r = _mm_movelh_ps(__r, __r);
300     __c = _mm_unpacklo_pi16(__a, __b);
301     __r = _mm_cvtpi32_ps(__r, __c);
302 
303     return __r;
304 }
305 
306 __ATTRIBUTE_SSE__
307 static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
308 {
309     __m64 __b;
310 
311     __b = _mm_setzero_si64();
312     __b = _mm_cmpgt_pi8(__b, __a);
313     __b = _mm_unpacklo_pi8(__a, __b);
314 
315     return _mm_cvtpi16_ps(__b);
316 }
317 
318 __ATTRIBUTE_SSE__
319 static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
320 {
321     __m64 __b;
322 
323     __b = _mm_setzero_si64();
324     __b = _mm_unpacklo_pi8(__a, __b);
325 
326     return _mm_cvtpi16_ps(__b);
327 }
328 
329 __ATTRIBUTE_SSE__
330 static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
331 {
332     __m128 __c;
333 
334     __c = _mm_setzero_ps();
335     __c = _mm_cvtpi32_ps(__c, __b);
336     __c = _mm_movelh_ps(__c, __c);
337 
338     return _mm_cvtpi32_ps(__c, __a);
339 }
340 
341 __ATTRIBUTE_SSE__
342 static __inline __m64 _mm_cvtps_pi16(__m128 __a)
343 {
344     __m64 __b, __c;
345 
346     __b = _mm_cvtps_pi32(__a);
347     __a = _mm_movehl_ps(__a, __a);
348     __c = _mm_cvtps_pi32(__a);
349 
350     return _mm_packs_pi32(__b, __c);
351 }
352 
353 __ATTRIBUTE_SSE__
354 static __inline __m64 _mm_cvtps_pi8(__m128 __a)
355 {
356     __m64 __b, __c;
357 
358     __b = _mm_cvtps_pi16(__a);
359     __c = _mm_setzero_si64();
360 
361     return _mm_packs_pi16(__b, __c);
362 }
363 
364 #endif /* _M_IX86 */
365 
366 /* Transpose the 4x4 matrix composed of row[0-3].  */
367 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
368 do {                                              \
369     __m128 t0 = _mm_unpacklo_ps(row0, row1);      \
370     __m128 t1 = _mm_unpacklo_ps(row2, row3);      \
371     __m128 t2 = _mm_unpackhi_ps(row0, row1);      \
372     __m128 t3 = _mm_unpackhi_ps(row2, row3);      \
373     (row0) = _mm_movelh_ps(t0, t1);               \
374     (row1) = _mm_movehl_ps(t1, t0);               \
375     (row2) = _mm_movelh_ps(t2, t3);               \
376     (row3) = _mm_movehl_ps(t3, t2);               \
377 } while (0)
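/* Usage sketch (illustrative), assuming m is a row-major float[16]:
 *
 *     __m128 r0 = _mm_loadu_ps(&m[0]);
 *     __m128 r1 = _mm_loadu_ps(&m[4]);
 *     __m128 r2 = _mm_loadu_ps(&m[8]);
 *     __m128 r3 = _mm_loadu_ps(&m[12]);
 *     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 *
 * Afterwards r0..r3 hold the columns of the original matrix. */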
378 
379 #define _MM_GET_EXCEPTION_STATE() \
380     (_mm_getcsr() & _MM_EXCEPT_MASK)
381 
382 #define _MM_GET_EXCEPTION_MASK() \
383     (_mm_getcsr() & _MM_MASK_MASK)
384 
385 #define _MM_GET_ROUNDING_MODE() \
386     (_mm_getcsr() & _MM_ROUND_MASK)
387 
388 #define _MM_GET_FLUSH_ZERO_MODE() \
389     (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
390 
391 #define _MM_SET_EXCEPTION_STATE(__mask) \
392     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))
393 
394 #define _MM_SET_EXCEPTION_MASK(__mask) \
395     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))
396 
397 #define _MM_SET_ROUNDING_MODE(__mode) \
398     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))
399 
400 #define _MM_SET_FLUSH_ZERO_MODE(__mode) \
401     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))
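/* Usage sketch (illustrative): temporarily switch SSE rounding to truncation
 * and restore the previous mode afterwards:
 *
 *     unsigned int old_mode = _MM_GET_ROUNDING_MODE();
 *     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *     // ... SSE operations round toward zero here ...
 *     _MM_SET_ROUNDING_MODE(old_mode);
 */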
402 
403 /* Use intrinsics on MSVC */
404 #if defined(_MSC_VER) && !defined(__clang__)
405 #pragma intrinsic(_mm_prefetch)
406 #pragma intrinsic(_mm_setzero_ps)
407 #pragma intrinsic(_mm_add_ss)
408 #pragma intrinsic(_mm_sub_ss)
409 #pragma intrinsic(_mm_mul_ss)
410 #pragma intrinsic(_mm_div_ss)
411 #pragma intrinsic(_mm_sqrt_ss)
412 #pragma intrinsic(_mm_rcp_ss)
413 #pragma intrinsic(_mm_rsqrt_ss)
414 #pragma intrinsic(_mm_min_ss)
415 #pragma intrinsic(_mm_max_ss)
416 #pragma intrinsic(_mm_add_ps)
417 #pragma intrinsic(_mm_sub_ps)
418 #pragma intrinsic(_mm_mul_ps)
419 #pragma intrinsic(_mm_div_ps)
420 #pragma intrinsic(_mm_sqrt_ps)
421 #pragma intrinsic(_mm_rcp_ps)
422 #pragma intrinsic(_mm_rsqrt_ps)
423 #pragma intrinsic(_mm_min_ps)
424 #pragma intrinsic(_mm_max_ps)
425 #pragma intrinsic(_mm_and_ps)
426 #pragma intrinsic(_mm_andnot_ps)
427 #pragma intrinsic(_mm_or_ps)
428 #pragma intrinsic(_mm_xor_ps)
429 #pragma intrinsic(_mm_cmpeq_ss)
430 #pragma intrinsic(_mm_cmplt_ss)
431 #pragma intrinsic(_mm_cmple_ss)
432 #pragma intrinsic(_mm_cmpgt_ss)
433 #pragma intrinsic(_mm_cmpge_ss)
434 #pragma intrinsic(_mm_cmpneq_ss)
435 #pragma intrinsic(_mm_cmpnlt_ss)
436 #pragma intrinsic(_mm_cmpnle_ss)
437 #pragma intrinsic(_mm_cmpngt_ss)
438 #pragma intrinsic(_mm_cmpnge_ss)
439 #pragma intrinsic(_mm_cmpord_ss)
440 #pragma intrinsic(_mm_cmpunord_ss)
441 #pragma intrinsic(_mm_cmpeq_ps)
442 #pragma intrinsic(_mm_cmplt_ps)
443 #pragma intrinsic(_mm_cmple_ps)
444 #pragma intrinsic(_mm_cmpgt_ps)
445 #pragma intrinsic(_mm_cmpge_ps)
446 #pragma intrinsic(_mm_cmpneq_ps)
447 #pragma intrinsic(_mm_cmpnlt_ps)
448 #pragma intrinsic(_mm_cmpnle_ps)
449 #pragma intrinsic(_mm_cmpngt_ps)
450 #pragma intrinsic(_mm_cmpnge_ps)
451 #pragma intrinsic(_mm_cmpord_ps)
452 #pragma intrinsic(_mm_cmpunord_ps)
453 #pragma intrinsic(_mm_comieq_ss)
454 #pragma intrinsic(_mm_comilt_ss)
455 #pragma intrinsic(_mm_comile_ss)
456 #pragma intrinsic(_mm_comigt_ss)
457 #pragma intrinsic(_mm_comige_ss)
458 #pragma intrinsic(_mm_comineq_ss)
459 #pragma intrinsic(_mm_ucomieq_ss)
460 #pragma intrinsic(_mm_ucomilt_ss)
461 #pragma intrinsic(_mm_ucomile_ss)
462 #pragma intrinsic(_mm_ucomigt_ss)
463 #pragma intrinsic(_mm_ucomige_ss)
464 #pragma intrinsic(_mm_ucomineq_ss)
465 #pragma intrinsic(_mm_cvt_ss2si)
466 #pragma intrinsic(_mm_cvtt_ss2si)
467 #pragma intrinsic(_mm_cvt_si2ss)
468 #ifdef _M_IX86
469 #pragma intrinsic(_mm_cvt_ps2pi)
470 #pragma intrinsic(_mm_cvtt_ps2pi)
471 #pragma intrinsic(_mm_cvt_pi2ps)
472 #endif // _M_IX86
473 #pragma intrinsic(_mm_shuffle_ps)
474 #pragma intrinsic(_mm_unpackhi_ps)
475 #pragma intrinsic(_mm_unpacklo_ps)
476 #pragma intrinsic(_mm_loadh_pi)
477 #pragma intrinsic(_mm_storeh_pi)
478 #pragma intrinsic(_mm_movehl_ps)
479 #pragma intrinsic(_mm_movelh_ps)
480 #pragma intrinsic(_mm_loadl_pi)
481 #pragma intrinsic(_mm_storel_pi)
482 #pragma intrinsic(_mm_movemask_ps)
483 #pragma intrinsic(_mm_getcsr)
484 #pragma intrinsic(_mm_setcsr)
485 #pragma intrinsic(_mm_set_ss)
486 #pragma intrinsic(_mm_set_ps1)
487 #pragma intrinsic(_mm_load_ss)
488 #pragma intrinsic(_mm_load_ps1)
489 #pragma intrinsic(_mm_load_ps)
490 #pragma intrinsic(_mm_loadu_ps)
491 #pragma intrinsic(_mm_loadr_ps)
492 #pragma intrinsic(_mm_set_ps)
493 #pragma intrinsic(_mm_setr_ps)
494 #pragma intrinsic(_mm_store_ss)
495 #pragma intrinsic(_mm_cvtss_f32)
496 #pragma intrinsic(_mm_store_ps)
497 #pragma intrinsic(_mm_storeu_ps)
498 #pragma intrinsic(_mm_store_ps1)
499 #pragma intrinsic(_mm_storer_ps)
500 #pragma intrinsic(_mm_move_ss)
501 #ifdef _M_IX86
502 #pragma intrinsic(_m_pextrw)
503 #pragma intrinsic(_m_pinsrw)
504 #pragma intrinsic(_m_pmaxsw)
505 #pragma intrinsic(_m_pmaxub)
506 #pragma intrinsic(_m_pminsw)
507 #pragma intrinsic(_m_pminub)
508 #pragma intrinsic(_m_pmovmskb)
509 #pragma intrinsic(_m_pmulhuw)
510 #pragma intrinsic(_m_pshufw)
511 #pragma intrinsic(_m_maskmovq)
512 #pragma intrinsic(_m_pavgb)
513 #pragma intrinsic(_m_pavgw)
514 #pragma intrinsic(_m_psadbw)
515 #pragma intrinsic(_mm_stream_pi)
516 #endif // _M_IX86
517 #pragma intrinsic(_mm_stream_ps)
518 #pragma intrinsic(_mm_sfence)
519 #ifdef _M_AMD64
520 #pragma intrinsic(_mm_cvtss_si64)
521 #pragma intrinsic(_mm_cvttss_si64)
522 #pragma intrinsic(_mm_cvtsi64_ss)
523 #endif // _M_AMD64
524 
525 #else /* _MSC_VER */
526 
527 /*
528   GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h
529   Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h
530 */
531 
532 /* Use inline functions on GCC/Clang */
533 
534 #if !HAS_BUILTIN(_mm_getcsr)
535 __INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
536 {
537     return __builtin_ia32_stmxcsr();
538 }
539 #endif
540 
541 #if !HAS_BUILTIN(_mm_setcsr)
542 __INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
543 {
544     __builtin_ia32_ldmxcsr(a);
545 }
546 #endif
547 
548 __INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b)
549 {
550     __a[0] += __b[0];
551     return __a;
552 }
553 
554 __INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b)
555 {
556     return (__m128)((__v4sf)__a + (__v4sf)__b);
557 }
558 
559 __INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b)
560 {
561     __a[0] -= __b[0];
562     return __a;
563 }
564 
565 __INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b)
566 {
567     return (__m128)((__v4sf)__a - (__v4sf)__b);
568 }
569 
570 __INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b)
571 {
572     __a[0] *= __b[0];
573     return __a;
574 }
575 
576 __INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b)
577 {
578     return (__m128)((__v4sf)__a * (__v4sf)__b);
579 }
580 
581 __INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b)
582 {
583     __a[0] /= __b[0];
584     return __a;
585 }
586 
587 __INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b)
588 {
589     return (__m128)((__v4sf)__a / (__v4sf)__b);
590 }
591 
592 __INTRIN_INLINE_SSE __m128 _mm_sqrt_ss(__m128 __a)
593 {
594     return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
595 }
596 
597 __INTRIN_INLINE_SSE __m128 _mm_sqrt_ps(__m128 __a)
598 {
599     return __builtin_ia32_sqrtps((__v4sf)__a);
600 }
601 
602 __INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a)
603 {
604     return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
605 }
606 
607 __INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a)
608 {
609     return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
610 }
611 
612 __INTRIN_INLINE_SSE __m128 _mm_rsqrt_ss(__m128 __a)
613 {
614     return __builtin_ia32_rsqrtss((__v4sf)__a);
615 }
616 
617 __INTRIN_INLINE_SSE __m128 _mm_rsqrt_ps(__m128 __a)
618 {
619     return __builtin_ia32_rsqrtps((__v4sf)__a);
620 }
621 
622 __INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b)
623 {
624     return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
625 }
626 
627 __INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b)
628 {
629     return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
630 }
631 
632 __INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b)
633 {
634     return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
635 }
636 
637 __INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b)
638 {
639     return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
640 }
641 
642 __INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b)
643 {
644     return (__m128)((__v4su)__a & (__v4su)__b);
645 }
646 
647 __INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b)
648 {
649     return (__m128)(~(__v4su)__a & (__v4su)__b);
650 }
651 
652 __INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b)
653 {
654     return (__m128)((__v4su)__a | (__v4su)__b);
655 }
656 
657 __INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b)
658 {
659     return (__m128)((__v4su)__a ^ (__v4su)__b);
660 }
661 
662 __INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b)
663 {
664     return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
665 }
666 
667 __INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b)
668 {
669     return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
670 }
671 
672 __INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b)
673 {
674     return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
675 }
676 
677 __INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b)
678 {
679     return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
680 }
681 
682 __INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b)
683 {
684     return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
685 }
686 
687 __INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b)
688 {
689     return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
690 }
691 
692 __INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b)
693 {
694     __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
695 #ifdef __clang__
696     return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
697 #else
698     return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
699 #endif
700 }
701 
702 __INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b)
703 {
704     return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
705 }
706 
707 __INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b)
708 {
709     __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
710 #ifdef __clang__
711     return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
712 #else
713     return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
714 #endif
715 }
716 
717 __INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b)
718 {
719     return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
720 }
721 
722 __INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b)
723 {
724     return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
725 }
726 
727 __INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b)
728 {
729     return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
730 }
731 
732 __INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
733 {
734     return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
735 }
736 
737 __INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
738 {
739     return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
740 }
741 
742 __INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b)
743 {
744     return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
745 }
746 
747 __INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b)
748 {
749     return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
750 }
751 
752 __INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b)
753 {
754     __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
755 #ifdef  __clang__
756     return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
757 #else
758     return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
759 #endif
760 }
761 
762 __INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b)
763 {
764     return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
765 }
766 
767 __INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b)
768 {
769     __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
770 #ifdef  __clang__
771     return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
772 #else
773     return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
774 #endif
775 }
776 
777 __INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b)
778 {
779     return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
780 }
781 
782 __INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b)
783 {
784     return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
785 }
786 
787 __INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b)
788 {
789     return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
790 }
791 
792 __INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b)
793 {
794     return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
795 }
796 
797 __INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b)
798 {
799     return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
800 }
801 
802 __INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b)
803 {
804     return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
805 }
806 
807 __INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b)
808 {
809     return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
810 }
811 
812 __INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b)
813 {
814     return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
815 }
816 
817 __INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b)
818 {
819     return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
820 }
821 
822 __INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b)
823 {
824     return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
825 }
826 
827 __INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b)
828 {
829     return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
830 }
831 
832 __INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b)
833 {
834     return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
835 }
836 
837 __INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b)
838 {
839     return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
840 }
841 
842 __INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b)
843 {
844     return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
845 }
846 
847 __INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b)
848 {
849     return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
850 }
851 
852 __INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b)
853 {
854     return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
855 }
856 
857 __INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b)
858 {
859     return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
860 }
861 
862 // _mm_cvt_ss2si
863 __INTRIN_INLINE_SSE int _mm_cvtss_si32(__m128 __a)
864 {
865     return __builtin_ia32_cvtss2si((__v4sf)__a);
866 }
867 
868 #ifdef _M_AMD64
869 __INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a)
870 {
871     return __builtin_ia32_cvtss2si64((__v4sf)__a);
872 }
873 #endif
874 
875 // _mm_cvt_ps2pi
876 __INTRIN_INLINE_SSE __m64 _mm_cvtps_pi32(__m128 __a)
877 {
878     return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
879 }
880 
881 // _mm_cvtt_ss2si
882 __INTRIN_INLINE_SSE int _mm_cvttss_si32(__m128 __a)
883 {
884     return __builtin_ia32_cvttss2si((__v4sf)__a);
885 }
886 
887 #ifdef _M_AMD64
888 __INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a)
889 {
890     return __builtin_ia32_cvttss2si64((__v4sf)__a);
891 }
892 #endif
893 
894 // _mm_cvtt_ps2pi
895 __INTRIN_INLINE_SSE __m64 _mm_cvttps_pi32(__m128 __a)
896 {
897     return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
898 }
899 
900 // _mm_cvt_si2ss
901 __INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b)
902 {
903     __a[0] = __b;
904     return __a;
905 }
906 
907 #ifdef _M_AMD64
908 __INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b)
909 {
910     __a[0] = __b;
911     return __a;
912 }
913 #endif
914 
915 // _mm_cvt_pi2ps
916 __INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
917 {
918     return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
919 }
920 
921 __INTRIN_INLINE_SSE float _mm_cvtss_f32(__m128 __a)
922 {
923     return __a[0];
924 }
925 
926 __INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p)
927 {
928 #ifdef  __clang__
929     typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
930     struct __mm_loadh_pi_struct {
931         __mm_loadh_pi_v2f32 __u;
932     } __attribute__((__packed__, __may_alias__));
933     __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
934     __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
935     return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
936 #else
937     return (__m128)__builtin_ia32_loadhps(__a, __p);
938 #endif
939 }
940 
941 __INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p)
942 {
943 #ifdef  __clang__
944     typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
945     struct __mm_loadl_pi_struct {
946         __mm_loadl_pi_v2f32 __u;
947     } __attribute__((__packed__, __may_alias__));
948     __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
949     __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
950     return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
951 #else
952     return (__m128)__builtin_ia32_loadlps(__a, __p);
953 #endif
954 }
955 
956 __INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p)
957 {
958     return _mm_set_ss(*__p);
959 }
960 
961 // _mm_load_ps1
962 __INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p)
963 {
964     return _mm_set1_ps(*__p);
965 }
966 
967 __INTRIN_INLINE_SSE __m128 _mm_load_ps(const float *__p)
968 {
969     return *(const __m128*)__p;
970 }
971 
972 __INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p)
973 {
974     struct __loadu_ps {
975         __m128_u __v;
976     } __attribute__((__packed__, __may_alias__));
977     return ((const struct __loadu_ps*)__p)->__v;
978 }
979 
980 __INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p)
981 {
982     __m128 __a = _mm_load_ps(__p);
983 #ifdef  __clang__
984     return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
985 #else
986     return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
987 #endif
988 }
989 
990 __INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void)
991 {
992 #ifdef __clang__
993     return (__m128)__builtin_ia32_undef128();
994 #else
995     __m128 undef = undef; /* intentionally left uninitialized: any value is acceptable */
996     return undef;
997 #endif
998 }
999 
1000 __INTRIN_INLINE_SSE __m128 _mm_set_ss(float __w)
1001 {
1002     return __extension__ (__m128){ __w, 0, 0, 0 };
1003 }
1004 
1005 // _mm_set_ps1
1006 __INTRIN_INLINE_SSE __m128 _mm_set1_ps(float __w)
1007 {
1008     return __extension__ (__m128){ __w, __w, __w, __w };
1009 }
1010 
1011 __INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w)
1012 {
1013     return __extension__ (__m128){ __w, __x, __y, __z };
1014 }
1015 
1016 __INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w)
1017 {
1018     return __extension__ (__m128){ __z, __y, __x, __w };
1019 }
1020 
1021 __INTRIN_INLINE_SSE __m128 _mm_setzero_ps(void)
1022 {
1023     return __extension__ (__m128){ 0, 0, 0, 0 };
1024 }
1025 
1026 __INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a)
1027 {
1028 #ifdef __clang__
1029     typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1030     struct __mm_storeh_pi_struct {
1031         __mm_storeh_pi_v2f32 __u;
1032     } __attribute__((__packed__, __may_alias__));
1033     ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1034 #else
1035     __builtin_ia32_storehps(__p, __a);
1036 #endif
1037 }
1038 
1039 __INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a)
1040 {
1041 #ifdef __clang__
1042     typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1043     struct __mm_storeh_pi_struct {
1044         __mm_storeh_pi_v2f32 __u;
1045     } __attribute__((__packed__, __may_alias__));
1046     ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1047 #else
1048     __builtin_ia32_storelps(__p, __a);
1049 #endif
1050 }
1051 
1052 __INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a)
1053 {
1054     *__p = ((__v4sf)__a)[0];
1055 }
1056 
1057 __INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a)
1058 {
1059     *(__m128_u *)__p = __a;
1060 }
1061 
1062 __INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a)
1063 {
1064     *(__m128*)__p = __a;
1065 }
1066 
1067 // _mm_store_ps1
1068 __INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a)
1069 {
1070     // FIXME: Should we use a temp instead?
1071 #ifdef __clang__
1072      __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1073 #else
1074     __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));
1075 #endif
1076     _mm_store_ps(__p, __a);
1077 }
1078 
1079 __INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a)
1080 {
1081 #ifdef  __clang__
1082     __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1083 #else
1084     __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
1085 #endif
1086     _mm_store_ps(__p, __tmp);
1087 }
1088 
1089 /* GCC / Clang specific constants */
1090 #define _MM_HINT_NTA_ALT 0
1091 #define _MM_HINT_T0_ALT  3
1092 #define _MM_HINT_T1_ALT  2
1093 #define _MM_HINT_T2_ALT  1
1094 #define _MM_HINT_ENTA_ALT 4
1095 
1096 // These are not supported yet
1097 //#define _MM_HINT_ET0_ALT 7
1098 //#define _MM_HINT_ET1_ALT 6
1099 //#define _MM_HINT_ET2_ALT 5
1100 
1101 #define _MM_HINT_MS_TO_ALT(sel) \
1102    (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
1103     ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
1104     ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
1105     ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
1106     ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)
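/* Example (illustrative): _MM_HINT_MS_TO_ALT(_MM_HINT_T0) yields _MM_HINT_T0_ALT (3),
 * so the __builtin_prefetch() fallback below receives rw = (3 >> 2) & 1 = 0 and
 * locality = 3 & 0x3 = 3, while _MM_HINT_ENTA (alt value 4) maps to rw = 1, locality = 0. */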
1107 
1108 #ifdef _MSC_VER
1109 
1110 /* On clang-cl we have an intrinsic, but the constants are different */
1111 #pragma intrinsic(_mm_prefetch)
1112 #define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))
1113 
1114 #else /* _MSC_VER */
1115 
1116 #define _mm_prefetch(p, sel) \
1117     __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)
1118 
1119 #endif /* _MSC_VER */
1120 
1121 __INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
1122 {
1123 #ifdef __clang__
1124     __builtin_ia32_movntq((__v1di*)__p, __a);
1125 #else
1126     __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);
1127 #endif
1128 }
1129 
1130 __INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a)
1131 {
1132 #ifdef __clang__
1133     __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
1134 #else
1135     __builtin_ia32_movntps(__p, (__v4sf)__a);
1136 #endif
1137 }
1138 
1139 #if !HAS_BUILTIN(_mm_sfence)
1140 __INTRIN_INLINE_SSE void _mm_sfence(void)
1141 {
1142     __builtin_ia32_sfence();
1143 }
1144 #endif
1145 
1146 #ifdef __clang__
1147 #define _m_pextrw(a, n) \
1148     ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
1149 
1150 #define _m_pinsrw(a, d, n) \
1151     ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
1152 #else
1153 // _m_pextrw
1154 __INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n)
1155 {
1156     return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
1157 }
1158 
1159 // _m_pinsrw
1160 __INTRIN_INLINE_SSE __m64 _mm_insert_pi16 (__m64 const __a, int const __d, int const __n)
1161 {
1162     return (__m64)__builtin_ia32_vec_set_v4hi ((__v4hi)__a, __d, __n);
1163 }
1164 
1165 #endif
1166 
1167 // _m_pmaxsw
1168 __INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b)
1169 {
1170     return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
1171 }
1172 
1173 // _m_pmaxub
1174 __INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b)
1175 {
1176     return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
1177 }
1178 
1179 // _m_pminsw
1180 __INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b)
1181 {
1182     return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
1183 }
1184 
1185 // _m_pminub
1186 __INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b)
1187 {
1188     return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
1189 }
1190 
1191 // _m_pmovmskb
1192 __INTRIN_INLINE_SSE int _mm_movemask_pi8(__m64 __a)
1193 {
1194     return __builtin_ia32_pmovmskb((__v8qi)__a);
1195 }
1196 
1197 // _m_pmulhuw
1198 __INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b)
1199 {
1200     return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
1201 }
1202 
1203 #ifdef __clang__
1204 #define _m_pshufw(a, n) \
1205     ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
1206 #else
1207 // _m_pshufw
1208 __INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16 (__m64 __a, int const __n)
1209 {
1210     return (__m64) __builtin_ia32_pshufw ((__v4hi)__a, __n);
1211 }
1212 #endif
1213 
1214 // _m_maskmovq
1215 __INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
1216 {
1217     __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
1218 }
1219 
1220 // _m_pavgb
1221 __INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b)
1222 {
1223     return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
1224 }
1225 
1226 // _m_pavgw
1227 __INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b)
1228 {
1229     return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
1230 }
1231 
1232 // _m_psadbw
1233 __INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b)
1234 {
1235     return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
1236 }
1237 
1238 #endif /* _MSC_VER */
1239 
1240 #ifdef __cplusplus
1241 }
1242 #endif // __cplusplus
1243 
1244 #endif /* _INCLUDED_MM2 */
1245