/*
 * xmmintrin.h
 *
 * This file is part of the ReactOS CRT package.
 *
 * Contributors:
 *   Timo Kreuzer (timo.kreuzer@reactos.org)
 *
 * THIS SOFTWARE IS NOT COPYRIGHTED
 *
 * This source code is offered for use in the public domain. You may
 * use, modify or distribute it freely.
 *
 * This code is distributed in the hope that it will be useful but
 * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
 * DISCLAIMED. This includes but is not limited to warranties of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#pragma once
#ifndef _INCLUDED_MM2
#define _INCLUDED_MM2

#include <mmintrin.h>

#if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
#define _MM_FUNCTIONALITY
#endif

#if !defined _VCRT_BUILD && !defined _INC_MALLOC
#include <malloc.h> // For _mm_malloc() and _mm_free()
#endif
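
/*
 * Usage sketch (illustrative only, not part of the original header):
 * _mm_malloc() returns storage aligned for 16-byte SSE loads/stores and
 * must be released with _mm_free().
 *
 *   float *buf = (float *)_mm_malloc(64 * sizeof(float), 16);
 *   if (buf != NULL)
 *   {
 *       _mm_store_ps(buf, _mm_setzero_ps());
 *       _mm_free(buf);
 *   }
 */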

#ifdef __cplusplus
extern "C" {
#endif

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128
{
    float m128_f32[4];
    unsigned __int64 m128_u64[2];
    __int8 m128_i8[16];
    __int16 m128_i16[8];
    __int32 m128_i32[4];
    __int64 m128_i64[2];
    unsigned __int8 m128_u8[16];
    unsigned __int16 m128_u16[8];
    unsigned __int32 m128_u32[4];
} __m128;
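
/*
 * Usage sketch (illustrative only, not part of the original header): on MSVC
 * the union members expose the individual lanes of an __m128, e.g.
 *
 *   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   float lo = v.m128_f32[0]; // 1.0f, the lowest lane
 */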

#define __ATTRIBUTE_SSE__

#else /* _MSC_VER */

typedef float __v4sf __attribute__((__vector_size__(16)));
typedef signed int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

#ifdef __clang__
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
#endif
#define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__

#endif /* _MSC_VER */

#define _MM_ALIGN16 _VCRT_ALIGN(16)

/* Constants for use with _mm_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_ENTA 4
#if 0 // Not supported yet
#define _MM_HINT_ET0 5
#define _MM_HINT_ET1 6
#define _MM_HINT_ET2 7
#endif
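
/*
 * Usage sketch (illustrative only, not part of the original header): prefetch
 * read-only data into all cache levels ahead of use.
 *
 *   void warm(const float *data)
 *   {
 *       _mm_prefetch((const char *)data, _MM_HINT_T0);
 *   }
 */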

/* Create a selector for use with the SHUFPS instruction. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
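
/*
 * Usage sketch (illustrative only, not part of the original header): the
 * selector packs four 2-bit lane indices. With both sources set to the same
 * vector, _MM_SHUFFLE(0, 1, 2, 3) reverses the element order (this is the
 * selector used by _mm_loadr_ps()/_mm_storer_ps() further down).
 *
 *   __m128 v   = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);             // lanes 1,2,3,4
 *   __m128 rev = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));  // lanes 4,3,2,1
 */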

/* Bits in the MXCSR. */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

#ifdef __ICL
void* __cdecl _mm_malloc(size_t Size, size_t Al);
void __cdecl _mm_free(void* P);
#endif

void _mm_prefetch(_In_ char const* p, _In_ int i);
__m128 _mm_setzero_ps(void);
__m128 _mm_add_ss(__m128 a, __m128 b);
__m128 _mm_sub_ss(__m128 a, __m128 b);
__m128 _mm_mul_ss(__m128 a, __m128 b);
__m128 _mm_div_ss(__m128 a, __m128 b);
__m128 _mm_sqrt_ss(__m128 a);
__m128 _mm_rcp_ss(__m128 a);
__m128 _mm_rsqrt_ss(__m128 a);
__m128 _mm_min_ss(__m128 a, __m128 b);
__m128 _mm_max_ss(__m128 a, __m128 b);
__m128 _mm_add_ps(__m128 a, __m128 b);
__m128 _mm_sub_ps(__m128 a, __m128 b);
__m128 _mm_mul_ps(__m128 a, __m128 b);
__m128 _mm_div_ps(__m128 a, __m128 b);
__m128 _mm_sqrt_ps(__m128 a);
__m128 _mm_rcp_ps(__m128 a);
__m128 _mm_rsqrt_ps(__m128 a);
__m128 _mm_min_ps(__m128 a, __m128 b);
__m128 _mm_max_ps(__m128 a, __m128 b);
__m128 _mm_and_ps(__m128 a, __m128 b);
__m128 _mm_andnot_ps(__m128 a, __m128 b);
__m128 _mm_or_ps(__m128 a, __m128 b);
__m128 _mm_xor_ps(__m128 a, __m128 b);
__m128 _mm_cmpeq_ss(__m128 a, __m128 b);
__m128 _mm_cmplt_ss(__m128 a, __m128 b);
__m128 _mm_cmple_ss(__m128 a, __m128 b);
__m128 _mm_cmpgt_ss(__m128 a, __m128 b);
__m128 _mm_cmpge_ss(__m128 a, __m128 b);
__m128 _mm_cmpneq_ss(__m128 a, __m128 b);
__m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
__m128 _mm_cmpnle_ss(__m128 a, __m128 b);
__m128 _mm_cmpngt_ss(__m128 a, __m128 b);
__m128 _mm_cmpnge_ss(__m128 a, __m128 b);
__m128 _mm_cmpord_ss(__m128 a, __m128 b);
__m128 _mm_cmpunord_ss(__m128 a, __m128 b);
__m128 _mm_cmpeq_ps(__m128 a, __m128 b);
__m128 _mm_cmplt_ps(__m128 a, __m128 b);
__m128 _mm_cmple_ps(__m128 a, __m128 b);
__m128 _mm_cmpgt_ps(__m128 a, __m128 b);
__m128 _mm_cmpge_ps(__m128 a, __m128 b);
__m128 _mm_cmpneq_ps(__m128 a, __m128 b);
__m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
__m128 _mm_cmpnle_ps(__m128 a, __m128 b);
__m128 _mm_cmpngt_ps(__m128 a, __m128 b);
__m128 _mm_cmpnge_ps(__m128 a, __m128 b);
__m128 _mm_cmpord_ps(__m128 a, __m128 b);
__m128 _mm_cmpunord_ps(__m128 a, __m128 b);
int _mm_comieq_ss(__m128 a, __m128 b);
int _mm_comilt_ss(__m128 a, __m128 b);
int _mm_comile_ss(__m128 a, __m128 b);
int _mm_comigt_ss(__m128 a, __m128 b);
int _mm_comige_ss(__m128 a, __m128 b);
int _mm_comineq_ss(__m128 a, __m128 b);
int _mm_ucomieq_ss(__m128 a, __m128 b);
int _mm_ucomilt_ss(__m128 a, __m128 b);
int _mm_ucomile_ss(__m128 a, __m128 b);
int _mm_ucomigt_ss(__m128 a, __m128 b);
int _mm_ucomige_ss(__m128 a, __m128 b);
int _mm_ucomineq_ss(__m128 a, __m128 b);
int _mm_cvt_ss2si(__m128 a);
int _mm_cvtt_ss2si(__m128 a);
__m128 _mm_cvt_si2ss(__m128 a, int b);
#ifdef _M_IX86
__m64 _mm_cvt_ps2pi(__m128 a);
__m64 _mm_cvtt_ps2pi(__m128 a);
__m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
#endif
__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
__m128 _mm_unpackhi_ps(__m128 a, __m128 b);
__m128 _mm_unpacklo_ps(__m128 a, __m128 b);
__m128 _mm_loadh_pi(__m128 a, __m64 const* p);
void _mm_storeh_pi(__m64* p, __m128 a);
__m128 _mm_movehl_ps(__m128 a, __m128 b);
__m128 _mm_movelh_ps(__m128 a, __m128 b);
__m128 _mm_loadl_pi(__m128 a, __m64 const* p);
void _mm_storel_pi(__m64* p, __m128 a);
int _mm_movemask_ps(__m128 a);
unsigned int _mm_getcsr(void);
void _mm_setcsr(unsigned int a);
__m128 _mm_set_ss(float a);
__m128 _mm_set_ps1(float a);
__m128 _mm_load_ss(float const* p);
__m128 _mm_load_ps1(float const* p);
__m128 _mm_load_ps(float const* p);
__m128 _mm_loadu_ps(float const* p);
__m128 _mm_loadr_ps(float const* p);
__m128 _mm_set_ps(float e3, float e2, float e1, float e0);
__m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
void _mm_store_ss(float* p, __m128 a);
float _mm_cvtss_f32(__m128 a);
void _mm_store_ps(float* p, __m128 a);
void _mm_storeu_ps(float* p, __m128 a);
void _mm_store_ps1(float* p, __m128 a);
void _mm_storer_ps(float* p, __m128 a);
__m128 _mm_move_ss(__m128 a, __m128 b);
#ifdef _M_IX86
int _m_pextrw(__m64 a, int imm8);
__m64 _m_pinsrw(__m64 a, int i, int imm8);
__m64 _m_pmaxsw(__m64 a, __m64 b);
__m64 _m_pmaxub(__m64 a, __m64 b);
__m64 _m_pminsw(__m64 a, __m64 b);
__m64 _m_pminub(__m64 a, __m64 b);
int _m_pmovmskb(__m64 a);
__m64 _m_pmulhuw(__m64 a, __m64 b);
__m64 _m_pshufw(__m64 a, int imm8);
void _m_maskmovq(__m64 a, __m64 b, char*);
__m64 _m_pavgb(__m64 a, __m64 b);
__m64 _m_pavgw(__m64 a, __m64 b);
__m64 _m_psadbw(__m64 a, __m64 b);
void _mm_stream_pi(__m64* p, __m64 a);
#endif
void _mm_stream_ps(float* p, __m128 a);
void _mm_sfence(void);
#ifdef _M_AMD64
__int64 _mm_cvtss_si64(__m128 a);
__int64 _mm_cvttss_si64(__m128 a);
__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
#endif

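/*
 * Usage sketch (illustrative only, not part of the original header): the
 * packed single-precision intrinsics declared above process four floats at
 * a time, e.g. adding two unaligned arrays element-wise:
 *
 *   void add4(float *dst, const float *x, const float *y)
 *   {
 *       __m128 a = _mm_loadu_ps(x);
 *       __m128 b = _mm_loadu_ps(y);
 *       _mm_storeu_ps(dst, _mm_add_ps(a, b));
 *   }
 */
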
/* Alternate names */
#define _mm_cvtss_si32 _mm_cvt_ss2si
#define _mm_cvttss_si32 _mm_cvtt_ss2si
#define _mm_cvtsi32_ss _mm_cvt_si2ss
#define _mm_set1_ps _mm_set_ps1
#define _mm_load1_ps _mm_load_ps1
#define _mm_store1_ps _mm_store_ps1
#define _mm_cvtps_pi32 _mm_cvt_ps2pi
#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
#define _mm_cvtpi32_ps _mm_cvt_pi2ps
#define _mm_extract_pi16 _m_pextrw
#define _mm_insert_pi16 _m_pinsrw
#define _mm_max_pi16 _m_pmaxsw
#define _mm_max_pu8 _m_pmaxub
#define _mm_min_pi16 _m_pminsw
#define _mm_min_pu8 _m_pminub
#define _mm_movemask_pi8 _m_pmovmskb
#define _mm_mulhi_pu16 _m_pmulhuw
#define _mm_shuffle_pi16 _m_pshufw
#define _mm_maskmove_si64 _m_maskmovq
#define _mm_avg_pu8 _m_pavgb
#define _mm_avg_pu16 _m_pavgw
#define _mm_sad_pu8 _m_psadbw

#ifdef _M_IX86
/* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi16(__b, __a);
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, __c);

    return __r;
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, __c);

    return __r;
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi8(__b, __a);
    __b = _mm_unpacklo_pi8(__a, __b);

    return _mm_cvtpi16_ps(__b);
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_unpacklo_pi8(__a, __b);

    return _mm_cvtpi16_ps(__b);
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
    __m128 __c;

    __c = _mm_setzero_ps();
    __c = _mm_cvtpi32_ps(__c, __b);
    __c = _mm_movelh_ps(__c, __c);

    return _mm_cvtpi32_ps(__c, __a);
}

__ATTRIBUTE_SSE__
static __inline __m64 _mm_cvtps_pi16(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi32(__a);
    __a = _mm_movehl_ps(__a, __a);
    __c = _mm_cvtps_pi32(__a);

    return _mm_packs_pi32(__b, __c);
}

__ATTRIBUTE_SSE__
static __inline __m64 _mm_cvtps_pi8(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi16(__a);
    __c = _mm_setzero_si64();

    return _mm_packs_pi16(__b, __c);
}

#endif /* _M_IX86 */

/* Transpose the 4x4 matrix composed of row[0-3]. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
    __m128 t0 = _mm_unpacklo_ps(row0, row1); \
    __m128 t1 = _mm_unpacklo_ps(row2, row3); \
    __m128 t2 = _mm_unpackhi_ps(row0, row1); \
    __m128 t3 = _mm_unpackhi_ps(row2, row3); \
    (row0) = _mm_movelh_ps(t0, t1); \
    (row1) = _mm_movehl_ps(t1, t0); \
    (row2) = _mm_movelh_ps(t2, t3); \
    (row3) = _mm_movehl_ps(t3, t2); \
} while (0)
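
/*
 * Usage sketch (illustrative only, not part of the original header): transpose
 * a row-major 4x4 float matrix held in four row vectors, in place.
 *
 *   __m128 r0 = _mm_loadu_ps(&m[0]);    // m is float[16], row-major
 *   __m128 r1 = _mm_loadu_ps(&m[4]);
 *   __m128 r2 = _mm_loadu_ps(&m[8]);
 *   __m128 r3 = _mm_loadu_ps(&m[12]);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0..r3 now hold the columns
 */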

#define _MM_GET_EXCEPTION_STATE() \
    (_mm_getcsr() & _MM_EXCEPT_MASK)

#define _MM_GET_EXCEPTION_MASK() \
    (_mm_getcsr() & _MM_MASK_MASK)

#define _MM_GET_ROUNDING_MODE() \
    (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_GET_FLUSH_ZERO_MODE() \
    (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)

#define _MM_SET_EXCEPTION_STATE(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))

#define _MM_SET_EXCEPTION_MASK(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))

#define _MM_SET_ROUNDING_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))

#define _MM_SET_FLUSH_ZERO_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))
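
/*
 * Usage sketch (illustrative only, not part of the original header): switch
 * the SSE rounding mode to truncation and restore the previous mode.
 *
 *   unsigned int old_mode = _MM_GET_ROUNDING_MODE();
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   // conversions such as _mm_cvt_ss2si() now round toward zero
 *   _MM_SET_ROUNDING_MODE(old_mode);
 */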

/* Use intrinsics on MSVC */
#if defined(_MSC_VER) && !defined(__clang__)
#pragma intrinsic(_mm_prefetch)
#pragma intrinsic(_mm_setzero_ps)
#pragma intrinsic(_mm_add_ss)
#pragma intrinsic(_mm_sub_ss)
#pragma intrinsic(_mm_mul_ss)
#pragma intrinsic(_mm_div_ss)
#pragma intrinsic(_mm_sqrt_ss)
#pragma intrinsic(_mm_rcp_ss)
#pragma intrinsic(_mm_rsqrt_ss)
#pragma intrinsic(_mm_min_ss)
#pragma intrinsic(_mm_max_ss)
#pragma intrinsic(_mm_add_ps)
#pragma intrinsic(_mm_sub_ps)
#pragma intrinsic(_mm_mul_ps)
#pragma intrinsic(_mm_div_ps)
#pragma intrinsic(_mm_sqrt_ps)
#pragma intrinsic(_mm_rcp_ps)
#pragma intrinsic(_mm_rsqrt_ps)
#pragma intrinsic(_mm_min_ps)
#pragma intrinsic(_mm_max_ps)
#pragma intrinsic(_mm_and_ps)
#pragma intrinsic(_mm_andnot_ps)
#pragma intrinsic(_mm_or_ps)
#pragma intrinsic(_mm_xor_ps)
#pragma intrinsic(_mm_cmpeq_ss)
#pragma intrinsic(_mm_cmplt_ss)
#pragma intrinsic(_mm_cmple_ss)
#pragma intrinsic(_mm_cmpgt_ss)
#pragma intrinsic(_mm_cmpge_ss)
#pragma intrinsic(_mm_cmpneq_ss)
#pragma intrinsic(_mm_cmpnlt_ss)
#pragma intrinsic(_mm_cmpnle_ss)
#pragma intrinsic(_mm_cmpngt_ss)
#pragma intrinsic(_mm_cmpnge_ss)
#pragma intrinsic(_mm_cmpord_ss)
#pragma intrinsic(_mm_cmpunord_ss)
#pragma intrinsic(_mm_cmpeq_ps)
#pragma intrinsic(_mm_cmplt_ps)
#pragma intrinsic(_mm_cmple_ps)
#pragma intrinsic(_mm_cmpgt_ps)
#pragma intrinsic(_mm_cmpge_ps)
#pragma intrinsic(_mm_cmpneq_ps)
#pragma intrinsic(_mm_cmpnlt_ps)
#pragma intrinsic(_mm_cmpnle_ps)
#pragma intrinsic(_mm_cmpngt_ps)
#pragma intrinsic(_mm_cmpnge_ps)
#pragma intrinsic(_mm_cmpord_ps)
#pragma intrinsic(_mm_cmpunord_ps)
#pragma intrinsic(_mm_comieq_ss)
#pragma intrinsic(_mm_comilt_ss)
#pragma intrinsic(_mm_comile_ss)
#pragma intrinsic(_mm_comigt_ss)
#pragma intrinsic(_mm_comige_ss)
#pragma intrinsic(_mm_comineq_ss)
#pragma intrinsic(_mm_ucomieq_ss)
#pragma intrinsic(_mm_ucomilt_ss)
#pragma intrinsic(_mm_ucomile_ss)
#pragma intrinsic(_mm_ucomigt_ss)
#pragma intrinsic(_mm_ucomige_ss)
#pragma intrinsic(_mm_ucomineq_ss)
#pragma intrinsic(_mm_cvt_ss2si)
#pragma intrinsic(_mm_cvtt_ss2si)
#pragma intrinsic(_mm_cvt_si2ss)
#ifdef _M_IX86
#pragma intrinsic(_mm_cvt_ps2pi)
#pragma intrinsic(_mm_cvtt_ps2pi)
#pragma intrinsic(_mm_cvt_pi2ps)
#endif // _M_IX86
#pragma intrinsic(_mm_shuffle_ps)
#pragma intrinsic(_mm_unpackhi_ps)
#pragma intrinsic(_mm_unpacklo_ps)
#pragma intrinsic(_mm_loadh_pi)
#pragma intrinsic(_mm_storeh_pi)
#pragma intrinsic(_mm_movehl_ps)
#pragma intrinsic(_mm_movelh_ps)
#pragma intrinsic(_mm_loadl_pi)
#pragma intrinsic(_mm_storel_pi)
#pragma intrinsic(_mm_movemask_ps)
#pragma intrinsic(_mm_getcsr)
#pragma intrinsic(_mm_setcsr)
#pragma intrinsic(_mm_set_ss)
#pragma intrinsic(_mm_set_ps1)
#pragma intrinsic(_mm_load_ss)
#pragma intrinsic(_mm_load_ps1)
#pragma intrinsic(_mm_load_ps)
#pragma intrinsic(_mm_loadu_ps)
#pragma intrinsic(_mm_loadr_ps)
#pragma intrinsic(_mm_set_ps)
#pragma intrinsic(_mm_setr_ps)
#pragma intrinsic(_mm_store_ss)
#pragma intrinsic(_mm_cvtss_f32)
#pragma intrinsic(_mm_store_ps)
#pragma intrinsic(_mm_storeu_ps)
#pragma intrinsic(_mm_store_ps1)
#pragma intrinsic(_mm_storer_ps)
#pragma intrinsic(_mm_move_ss)
#ifdef _M_IX86
#pragma intrinsic(_m_pextrw)
#pragma intrinsic(_m_pinsrw)
#pragma intrinsic(_m_pmaxsw)
#pragma intrinsic(_m_pmaxub)
#pragma intrinsic(_m_pminsw)
#pragma intrinsic(_m_pminub)
#pragma intrinsic(_m_pmovmskb)
#pragma intrinsic(_m_pmulhuw)
#pragma intrinsic(_m_pshufw)
#pragma intrinsic(_m_maskmovq)
#pragma intrinsic(_m_pavgb)
#pragma intrinsic(_m_pavgw)
#pragma intrinsic(_m_psadbw)
#pragma intrinsic(_mm_stream_pi)
#endif // _M_IX86
#pragma intrinsic(_mm_stream_ps)
#pragma intrinsic(_mm_sfence)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtss_si64)
#pragma intrinsic(_mm_cvttss_si64)
#pragma intrinsic(_mm_cvtsi64_ss)
#endif // _M_AMD64

#else /* _MSC_VER */

/*
  GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h
*/

/* Use inline functions on GCC/Clang */

#if !HAS_BUILTIN(_mm_getcsr)
__INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
{
    return __builtin_ia32_stmxcsr();
}
#endif

#if !HAS_BUILTIN(_mm_setcsr)
__INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
{
    __builtin_ia32_ldmxcsr(a);
}
#endif

__INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b)
{
    __a[0] += __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a + (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b)
{
    __a[0] -= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a - (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b)
{
    __a[0] *= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a * (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b)
{
    __a[0] /= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a / (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_sqrt_ss(__m128 __a)
{
    return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_sqrt_ps(__m128 __a)
{
    return __builtin_ia32_sqrtps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a)
{
    return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a)
{
    return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ss(__m128 __a)
{
    return __builtin_ia32_rsqrtss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ps(__m128 __a)
{
    return __builtin_ia32_rsqrtps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b)
{
    return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b)
{
    return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a & (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b)
{
    return (__m128)(~(__v4su)__a & (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a | (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a ^ (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
}

// _mm_cvt_ss2si
__INTRIN_INLINE_SSE int _mm_cvtss_si32(__m128 __a)
{
    return __builtin_ia32_cvtss2si((__v4sf)__a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a)
{
    return __builtin_ia32_cvtss2si64((__v4sf)__a);
}
#endif

// _mm_cvt_ps2pi
__INTRIN_INLINE_SSE __m64 _mm_cvtps_pi32(__m128 __a)
{
    return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
}

// _mm_cvtt_ss2si
__INTRIN_INLINE_SSE int _mm_cvttss_si32(__m128 __a)
{
    return __builtin_ia32_cvttss2si((__v4sf)__a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a)
{
    return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
#endif

// _mm_cvtt_ps2pi
__INTRIN_INLINE_SSE __m64 _mm_cvttps_pi32(__m128 __a)
{
    return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
}

// _mm_cvt_si2ss
__INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b)
{
    __a[0] = __b;
    return __a;
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b)
{
    __a[0] = __b;
    return __a;
}
#endif

// _mm_cvt_pi2ps
__INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
    return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
}

__INTRIN_INLINE_SSE float _mm_cvtss_f32(__m128 __a)
{
    return __a[0];
}

__INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p)
{
#ifdef __clang__
    typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadh_pi_struct {
        __mm_loadh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
#else
    return (__m128)__builtin_ia32_loadhps(__a, __p);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p)
{
#ifdef __clang__
    typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadl_pi_struct {
        __mm_loadl_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
#else
    return (__m128)__builtin_ia32_loadlps(__a, __p);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p)
{
    return _mm_set_ss(*__p);
}

// _mm_load_ps1
__INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p)
{
    return _mm_set1_ps(*__p);
}

__INTRIN_INLINE_SSE __m128 _mm_load_ps(const float *__p)
{
    return *(const __m128*)__p;
}

__INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p)
{
    struct __loadu_ps {
        __m128_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_ps*)__p)->__v;
}

__INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p)
{
    __m128 __a = _mm_load_ps(__p);
#ifdef __clang__
    return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void)
{
#ifdef __clang__
    return (__m128)__builtin_ia32_undef128();
#else
    __m128 undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_set_ss(float __w)
{
    return __extension__ (__m128){ __w, 0, 0, 0 };
}

// _mm_set_ps1
__INTRIN_INLINE_SSE __m128 _mm_set1_ps(float __w)
{
    return __extension__ (__m128){ __w, __w, __w, __w };
}

__INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w)
{
    return __extension__ (__m128){ __w, __x, __y, __z };
}

__INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w)
{
    return __extension__ (__m128){ __z, __y, __x, __w };
}

__INTRIN_INLINE_SSE __m128 _mm_setzero_ps(void)
{
    return __extension__ (__m128){ 0, 0, 0, 0 };
}

__INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a)
{
#ifdef __clang__
    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
#else
    __builtin_ia32_storehps(__p, __a);
#endif
}

__INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a)
{
#ifdef __clang__
    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
#else
    __builtin_ia32_storelps(__p, __a);
#endif
}

__INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a)
{
    *__p = ((__v4sf)__a)[0];
}

__INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a)
{
    *(__m128_u *)__p = __a;
}

__INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a)
{
    *(__m128*)__p = __a;
}

// _mm_store_ps1
__INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a)
{
    // FIXME: Should we use a temp instead?
#ifdef __clang__
    __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
#else
    __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));
#endif
    _mm_store_ps(__p, __a);
}

__INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
    _mm_store_ps(__p, __tmp);
}

/* GCC / Clang specific constants */
#define _MM_HINT_NTA_ALT 0
#define _MM_HINT_T0_ALT 3
#define _MM_HINT_T1_ALT 2
#define _MM_HINT_T2_ALT 1
#define _MM_HINT_ENTA_ALT 4

// These are not supported yet
//#define _MM_HINT_ET0_ALT 7
//#define _MM_HINT_ET1_ALT 6
//#define _MM_HINT_ET2_ALT 5

#define _MM_HINT_MS_TO_ALT(sel) \
    (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
     ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
     ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
     ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
     ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)

#ifdef _MSC_VER

/* On clang-cl we have an intrinsic, but the constants are different */
#pragma intrinsic(_mm_prefetch)
#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))

#else /* _MSC_VER */

#define _mm_prefetch(p, sel) \
    __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)
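
/* The ALT encoding packs __builtin_prefetch()'s arguments: bit 2 is the rw
   argument (1 only for the _MM_HINT_ENTA write hint) and the low two bits are
   the temporal locality (NTA = 0 ... T0 = 3). */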

#endif /* _MSC_VER */

__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
{
#ifdef __clang__
    __builtin_ia32_movntq((__v1di*)__p, __a);
#else
    __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);
#endif
}

__INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
#else
    __builtin_ia32_movntps(__p, (__v4sf)__a);
#endif
}

#if !HAS_BUILTIN(_mm_sfence)
__INTRIN_INLINE_SSE void _mm_sfence(void)
{
    __builtin_ia32_sfence();
}
#endif

#ifdef __clang__
#define _m_pextrw(a, n) \
    ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))

#define _m_pinsrw(a, d, n) \
    ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
#else
// _m_pextrw
__INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n)
{
    return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
}

// _m_pinsrw
__INTRIN_INLINE_SSE __m64 _mm_insert_pi16(__m64 const __a, int const __d, int const __n)
{
    return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__a, __d, __n);
}

#endif

// _m_pmaxsw
__INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pmaxub
__INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

// _m_pminsw
__INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pminub
__INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}

// _m_pmovmskb
__INTRIN_INLINE_SSE int _mm_movemask_pi8(__m64 __a)
{
    return __builtin_ia32_pmovmskb((__v8qi)__a);
}

// _m_pmulhuw
__INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}

#ifdef __clang__
#define _m_pshufw(a, n) \
    ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
#else
// _m_pshufw
__INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16(__m64 __a, int const __n)
{
    return (__m64)__builtin_ia32_pshufw((__v4hi)__a, __n);
}
#endif

// _m_maskmovq
__INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
    __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}

// _m_pavgb
__INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

// _m_pavgw
__INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

// _m_psadbw
__INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}

#endif // __GNUC__

#ifdef __cplusplus
}
#endif // __cplusplus

#endif /* _INCLUDED_MM2 */