/*
 * xmmintrin.h
 *
 * This file is part of the ReactOS CRT package.
 *
 * Contributors:
 *   Timo Kreuzer (timo.kreuzer@reactos.org)
 *
 * THIS SOFTWARE IS NOT COPYRIGHTED
 *
 * This source code is offered for use in the public domain. You may
 * use, modify or distribute it freely.
 *
 * This code is distributed in the hope that it will be useful but
 * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
 * DISCLAIMED. This includes but is not limited to warranties of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#pragma once
#ifndef _INCLUDED_MM2
#define _INCLUDED_MM2

#include <mmintrin.h>

#if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
#define _MM_FUNCTIONALITY
#endif

#if !defined _VCRT_BUILD && !defined _INC_MALLOC
#include <malloc.h> // For _mm_malloc() and _mm_free()
#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128
{
    float m128_f32[4];
    unsigned __int64 m128_u64[2];
    __int8 m128_i8[16];
    __int16 m128_i16[8];
    __int32 m128_i32[4];
    __int64 m128_i64[2];
    unsigned __int8 m128_u8[16];
    unsigned __int16 m128_u16[8];
    unsigned __int32 m128_u32[4];
} __m128;

#define __ATTRIBUTE_SSE__

#else /* _MSC_VER */

typedef float __v4sf __attribute__((__vector_size__(16)));
typedef signed int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

#ifdef __clang__
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
#endif
#define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__

#endif /* _MSC_VER */

#define _MM_ALIGN16 _VCRT_ALIGN(16)

/* Constants for use with _mm_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_ENTA 4
#if 0 // Not supported yet
#define _MM_HINT_ET0 5
#define _MM_HINT_ET1 6
#define _MM_HINT_ET2 7
#endif

/* Create a selector for use with the SHUFPS instruction. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
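/*
 * Usage sketch (illustrative values only): _MM_SHUFFLE packs four 2-bit lane
 * selectors into the imm8 expected by SHUFPS. With _mm_shuffle_ps(a, b, imm8)
 * the two low result lanes are taken from a and the two high lanes from b.
 *
 *   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 *   __m128 r = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)); // reverses a: 4,3,2,1
 */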
/* Bits in the MXCSR. */
#define _MM_EXCEPT_MASK 0x003f
#define _MM_EXCEPT_INVALID 0x0001
#define _MM_EXCEPT_DENORM 0x0002
#define _MM_EXCEPT_DIV_ZERO 0x0004
#define _MM_EXCEPT_OVERFLOW 0x0008
#define _MM_EXCEPT_UNDERFLOW 0x0010
#define _MM_EXCEPT_INEXACT 0x0020

#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_INVALID 0x0080
#define _MM_MASK_DENORM 0x0100
#define _MM_MASK_DIV_ZERO 0x0200
#define _MM_MASK_OVERFLOW 0x0400
#define _MM_MASK_UNDERFLOW 0x0800
#define _MM_MASK_INEXACT 0x1000

#define _MM_ROUND_MASK 0x6000
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000

#ifdef __ICL
void* __cdecl _mm_malloc(size_t Size, size_t Al);
void __cdecl _mm_free(void* P);
#endif

void _mm_prefetch(_In_ char const* p, _In_ int i);
__m128 _mm_setzero_ps(void);
__m128 _mm_add_ss(__m128 a, __m128 b);
__m128 _mm_sub_ss(__m128 a, __m128 b);
__m128 _mm_mul_ss(__m128 a, __m128 b);
__m128 _mm_div_ss(__m128 a, __m128 b);
__m128 _mm_sqrt_ss(__m128 a);
__m128 _mm_rcp_ss(__m128 a);
__m128 _mm_rsqrt_ss(__m128 a);
__m128 _mm_min_ss(__m128 a, __m128 b);
__m128 _mm_max_ss(__m128 a, __m128 b);
__m128 _mm_add_ps(__m128 a, __m128 b);
__m128 _mm_sub_ps(__m128 a, __m128 b);
__m128 _mm_mul_ps(__m128 a, __m128 b);
__m128 _mm_div_ps(__m128 a, __m128 b);
__m128 _mm_sqrt_ps(__m128 a);
__m128 _mm_rcp_ps(__m128 a);
__m128 _mm_rsqrt_ps(__m128 a);
__m128 _mm_min_ps(__m128 a, __m128 b);
__m128 _mm_max_ps(__m128 a, __m128 b);
__m128 _mm_and_ps(__m128 a, __m128 b);
__m128 _mm_andnot_ps(__m128 a, __m128 b);
__m128 _mm_or_ps(__m128 a, __m128 b);
__m128 _mm_xor_ps(__m128 a, __m128 b);
__m128 _mm_cmpeq_ss(__m128 a, __m128 b);
__m128 _mm_cmplt_ss(__m128 a, __m128 b);
__m128 _mm_cmple_ss(__m128 a, __m128 b);
__m128 _mm_cmpgt_ss(__m128 a, __m128 b);
__m128 _mm_cmpge_ss(__m128 a, __m128 b);
__m128 _mm_cmpneq_ss(__m128 a, __m128 b);
__m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
__m128 _mm_cmpnle_ss(__m128 a, __m128 b);
__m128 _mm_cmpngt_ss(__m128 a, __m128 b);
__m128 _mm_cmpnge_ss(__m128 a, __m128 b);
__m128 _mm_cmpord_ss(__m128 a, __m128 b);
__m128 _mm_cmpunord_ss(__m128 a, __m128 b);
__m128 _mm_cmpeq_ps(__m128 a, __m128 b);
__m128 _mm_cmplt_ps(__m128 a, __m128 b);
__m128 _mm_cmple_ps(__m128 a, __m128 b);
__m128 _mm_cmpgt_ps(__m128 a, __m128 b);
__m128 _mm_cmpge_ps(__m128 a, __m128 b);
__m128 _mm_cmpneq_ps(__m128 a, __m128 b);
__m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
__m128 _mm_cmpnle_ps(__m128 a, __m128 b);
__m128 _mm_cmpngt_ps(__m128 a, __m128 b);
__m128 _mm_cmpnge_ps(__m128 a, __m128 b);
__m128 _mm_cmpord_ps(__m128 a, __m128 b);
__m128 _mm_cmpunord_ps(__m128 a, __m128 b);
int _mm_comieq_ss(__m128 a, __m128 b);
int _mm_comilt_ss(__m128 a, __m128 b);
int _mm_comile_ss(__m128 a, __m128 b);
int _mm_comigt_ss(__m128 a, __m128 b);
int _mm_comige_ss(__m128 a, __m128 b);
int _mm_comineq_ss(__m128 a, __m128 b);
int _mm_ucomieq_ss(__m128 a, __m128 b);
int _mm_ucomilt_ss(__m128 a, __m128 b);
int _mm_ucomile_ss(__m128 a, __m128 b);
int _mm_ucomigt_ss(__m128 a, __m128 b);
int _mm_ucomige_ss(__m128 a, __m128 b);
int _mm_ucomineq_ss(__m128 a, __m128 b);
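/*
 * Note on the compare intrinsics above: the _mm_comi*_ss family compares only
 * the low elements and raises the invalid-operation exception for any NaN
 * operand, while the _mm_ucomi*_ss ("unordered") family signals invalid only
 * for signaling NaNs. Usage sketch (illustrative only):
 *
 *   if (_mm_comilt_ss(a, b)) { ... }  // true when a[0] < b[0]
 */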
int _mm_cvt_ss2si(__m128 a);
int _mm_cvtt_ss2si(__m128 a);
__m128 _mm_cvt_si2ss(__m128 a, int b);
#ifdef _M_IX86
__m64 _mm_cvt_ps2pi(__m128 a);
__m64 _mm_cvtt_ps2pi(__m128 a);
__m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
#endif
__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
__m128 _mm_unpackhi_ps(__m128 a, __m128 b);
__m128 _mm_unpacklo_ps(__m128 a, __m128 b);
__m128 _mm_loadh_pi(__m128 a, __m64 const* p);
void _mm_storeh_pi(__m64* p, __m128 a);
__m128 _mm_movehl_ps(__m128 a, __m128 b);
__m128 _mm_movelh_ps(__m128 a, __m128 b);
__m128 _mm_loadl_pi(__m128 a, __m64 const* p);
void _mm_storel_pi(__m64* p, __m128 a);
int _mm_movemask_ps(__m128 a);
unsigned int _mm_getcsr(void);
void _mm_setcsr(unsigned int a);
__m128 _mm_set_ss(float a);
__m128 _mm_set_ps1(float a);
__m128 _mm_load_ss(float const* p);
__m128 _mm_load_ps1(float const* p);
__m128 _mm_load_ps(float const* p);
__m128 _mm_loadu_ps(float const* p);
__m128 _mm_loadr_ps(float const* p);
__m128 _mm_set_ps(float e3, float e2, float e1, float e0);
__m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
void _mm_store_ss(float* p, __m128 a);
float _mm_cvtss_f32(__m128 a);
void _mm_store_ps(float* p, __m128 a);
void _mm_storeu_ps(float* p, __m128 a);
void _mm_store_ps1(float* p, __m128 a);
void _mm_storer_ps(float* p, __m128 a);
__m128 _mm_move_ss(__m128 a, __m128 b);
#ifdef _M_IX86
int _m_pextrw(__m64 a, int imm8);
__m64 _m_pinsrw(__m64 a, int i, int imm8);
__m64 _m_pmaxsw(__m64 a, __m64 b);
__m64 _m_pmaxub(__m64 a, __m64 b);
__m64 _m_pminsw(__m64 a, __m64 b);
__m64 _m_pminub(__m64 a, __m64 b);
int _m_pmovmskb(__m64 a);
__m64 _m_pmulhuw(__m64 a, __m64 b);
__m64 _m_pshufw(__m64 a, int imm8);
void _m_maskmovq(__m64 a, __m64 b, char*);
__m64 _m_pavgb(__m64 a, __m64 b);
__m64 _m_pavgw(__m64 a, __m64 b);
__m64 _m_psadbw(__m64 a, __m64 b);
void _mm_stream_pi(__m64* p, __m64 a);
#endif
void _mm_stream_ps(float* p, __m128 a);
void _mm_sfence(void);
#ifdef _M_AMD64
__int64 _mm_cvtss_si64(__m128 a);
__int64 _mm_cvttss_si64(__m128 a);
__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
#endif

/* Alternate names */
#define _mm_cvtss_si32 _mm_cvt_ss2si
#define _mm_cvttss_si32 _mm_cvtt_ss2si
#define _mm_cvtsi32_ss _mm_cvt_si2ss
#define _mm_set1_ps _mm_set_ps1
#define _mm_load1_ps _mm_load_ps1
#define _mm_store1_ps _mm_store_ps1
#define _mm_cvtps_pi32 _mm_cvt_ps2pi
#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
#define _mm_cvtpi32_ps _mm_cvt_pi2ps
#define _mm_extract_pi16 _m_pextrw
#define _mm_insert_pi16 _m_pinsrw
#define _mm_max_pi16 _m_pmaxsw
#define _mm_max_pu8 _m_pmaxub
#define _mm_min_pi16 _m_pminsw
#define _mm_min_pu8 _m_pminub
#define _mm_movemask_pi8 _m_pmovmskb
#define _mm_mulhi_pu16 _m_pmulhuw
#define _mm_shuffle_pi16 _m_pshufw
#define _mm_maskmove_si64 _m_maskmovq
#define _mm_avg_pu8 _m_pavgb
#define _mm_avg_pu16 _m_pavgw
#define _mm_sad_pu8 _m_psadbw

#ifdef _M_IX86
/* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi16(__b, __a);
    __c = _mm_unpackhi_pi16(__a, __b);
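    /* __b is now a per-element sign mask (all ones where __a is negative), so
       the unpacks widen the 16-bit elements to sign-extended 32-bit values;
       elements 2 and 3 are converted first and then moved to the high half. */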
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, __c);

    return __r;
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, __c);

    return __r;
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi8(__b, __a);
    __b = _mm_unpacklo_pi8(__a, __b);

    return _mm_cvtpi16_ps(__b);
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_unpacklo_pi8(__a, __b);

    return _mm_cvtpi16_ps(__b);
}

__ATTRIBUTE_SSE__
static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
    __m128 __c;

    __c = _mm_setzero_ps();
    __c = _mm_cvtpi32_ps(__c, __b);
    __c = _mm_movelh_ps(__c, __c);

    return _mm_cvtpi32_ps(__c, __a);
}

__ATTRIBUTE_SSE__
static __inline __m64 _mm_cvtps_pi16(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi32(__a);
    __a = _mm_movehl_ps(__a, __a);
    __c = _mm_cvtps_pi32(__a);

    return _mm_packs_pi32(__b, __c);
}

__ATTRIBUTE_SSE__
static __inline __m64 _mm_cvtps_pi8(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi16(__a);
    __c = _mm_setzero_si64();

    return _mm_packs_pi16(__b, __c);
}

#endif /* _M_IX86 */
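/*
 * Usage sketch for the conversion helpers above (illustrative values only):
 * expand the four low unsigned bytes of an __m64 to floats and pack them back
 * to 16-bit integers. These helpers use MMX registers, so _mm_empty() is
 * required before any following x87 floating-point code.
 *
 *   __m64  pix  = _mm_cvtsi32_si64(0x04030201); // bytes 1,2,3,4 in the low half
 *   __m128 f    = _mm_cvtpu8_ps(pix);           // {1.0f, 2.0f, 3.0f, 4.0f}
 *   __m64  back = _mm_cvtps_pi16(f);            // packed back to 16-bit ints
 *   _mm_empty();
 */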
/* Transpose the 4x4 matrix composed of row[0-3]. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
    __m128 t0 = _mm_unpacklo_ps(row0, row1); \
    __m128 t1 = _mm_unpacklo_ps(row2, row3); \
    __m128 t2 = _mm_unpackhi_ps(row0, row1); \
    __m128 t3 = _mm_unpackhi_ps(row2, row3); \
    (row0) = _mm_movelh_ps(t0, t1); \
    (row1) = _mm_movehl_ps(t1, t0); \
    (row2) = _mm_movelh_ps(t2, t3); \
    (row3) = _mm_movehl_ps(t3, t2); \
} while (0)

#define _MM_GET_EXCEPTION_STATE() \
    (_mm_getcsr() & _MM_EXCEPT_MASK)

#define _MM_GET_EXCEPTION_MASK() \
    (_mm_getcsr() & _MM_MASK_MASK)

#define _MM_GET_ROUNDING_MODE() \
    (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_GET_FLUSH_ZERO_MODE() \
    (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)

#define _MM_SET_EXCEPTION_STATE(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))

#define _MM_SET_EXCEPTION_MASK(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))

#define _MM_SET_ROUNDING_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))

#define _MM_SET_FLUSH_ZERO_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))

/* Use intrinsics on MSVC */
#if defined(_MSC_VER) && !defined(__clang__)
#pragma intrinsic(_mm_prefetch)
#pragma intrinsic(_mm_setzero_ps)
#pragma intrinsic(_mm_add_ss)
#pragma intrinsic(_mm_sub_ss)
#pragma intrinsic(_mm_mul_ss)
#pragma intrinsic(_mm_div_ss)
#pragma intrinsic(_mm_sqrt_ss)
#pragma intrinsic(_mm_rcp_ss)
#pragma intrinsic(_mm_rsqrt_ss)
#pragma intrinsic(_mm_min_ss)
#pragma intrinsic(_mm_max_ss)
#pragma intrinsic(_mm_add_ps)
#pragma intrinsic(_mm_sub_ps)
#pragma intrinsic(_mm_mul_ps)
#pragma intrinsic(_mm_div_ps)
#pragma intrinsic(_mm_sqrt_ps)
#pragma intrinsic(_mm_rcp_ps)
#pragma intrinsic(_mm_rsqrt_ps)
#pragma intrinsic(_mm_min_ps)
#pragma intrinsic(_mm_max_ps)
#pragma intrinsic(_mm_and_ps)
#pragma intrinsic(_mm_andnot_ps)
#pragma intrinsic(_mm_or_ps)
#pragma intrinsic(_mm_xor_ps)
#pragma intrinsic(_mm_cmpeq_ss)
#pragma intrinsic(_mm_cmplt_ss)
#pragma intrinsic(_mm_cmple_ss)
#pragma intrinsic(_mm_cmpgt_ss)
#pragma intrinsic(_mm_cmpge_ss)
#pragma intrinsic(_mm_cmpneq_ss)
#pragma intrinsic(_mm_cmpnlt_ss)
#pragma intrinsic(_mm_cmpnle_ss)
#pragma intrinsic(_mm_cmpngt_ss)
#pragma intrinsic(_mm_cmpnge_ss)
#pragma intrinsic(_mm_cmpord_ss)
#pragma intrinsic(_mm_cmpunord_ss)
#pragma intrinsic(_mm_cmpeq_ps)
#pragma intrinsic(_mm_cmplt_ps)
#pragma intrinsic(_mm_cmple_ps)
#pragma intrinsic(_mm_cmpgt_ps)
#pragma intrinsic(_mm_cmpge_ps)
#pragma intrinsic(_mm_cmpneq_ps)
#pragma intrinsic(_mm_cmpnlt_ps)
#pragma intrinsic(_mm_cmpnle_ps)
#pragma intrinsic(_mm_cmpngt_ps)
#pragma intrinsic(_mm_cmpnge_ps)
#pragma intrinsic(_mm_cmpord_ps)
#pragma intrinsic(_mm_cmpunord_ps)
#pragma intrinsic(_mm_comieq_ss)
#pragma intrinsic(_mm_comilt_ss)
#pragma intrinsic(_mm_comile_ss)
#pragma intrinsic(_mm_comigt_ss)
#pragma intrinsic(_mm_comige_ss)
#pragma intrinsic(_mm_comineq_ss)
#pragma intrinsic(_mm_ucomieq_ss)
#pragma intrinsic(_mm_ucomilt_ss)
#pragma intrinsic(_mm_ucomile_ss)
#pragma intrinsic(_mm_ucomigt_ss)
#pragma intrinsic(_mm_ucomige_ss)
#pragma intrinsic(_mm_ucomineq_ss)
#pragma intrinsic(_mm_cvt_ss2si)
#pragma intrinsic(_mm_cvtt_ss2si)
#pragma intrinsic(_mm_cvt_si2ss)
#ifdef _M_IX86
#pragma intrinsic(_mm_cvt_ps2pi)
#pragma intrinsic(_mm_cvtt_ps2pi)
#pragma intrinsic(_mm_cvt_pi2ps)
#endif // _M_IX86
#pragma intrinsic(_mm_shuffle_ps)
#pragma intrinsic(_mm_unpackhi_ps)
#pragma intrinsic(_mm_unpacklo_ps)
#pragma intrinsic(_mm_loadh_pi)
#pragma intrinsic(_mm_storeh_pi)
#pragma intrinsic(_mm_movehl_ps)
#pragma intrinsic(_mm_movelh_ps)
#pragma intrinsic(_mm_loadl_pi)
#pragma intrinsic(_mm_storel_pi)
#pragma intrinsic(_mm_movemask_ps)
#pragma intrinsic(_mm_getcsr)
#pragma intrinsic(_mm_setcsr)
#pragma intrinsic(_mm_set_ss)
#pragma intrinsic(_mm_set_ps1)
#pragma intrinsic(_mm_load_ss)
#pragma intrinsic(_mm_load_ps1)
#pragma intrinsic(_mm_load_ps)
#pragma intrinsic(_mm_loadu_ps)
#pragma intrinsic(_mm_loadr_ps)
#pragma intrinsic(_mm_set_ps)
#pragma intrinsic(_mm_setr_ps)
#pragma intrinsic(_mm_store_ss)
#pragma intrinsic(_mm_cvtss_f32)
#pragma intrinsic(_mm_store_ps)
#pragma intrinsic(_mm_storeu_ps)
#pragma intrinsic(_mm_store_ps1)
#pragma intrinsic(_mm_storer_ps)
#pragma intrinsic(_mm_move_ss)
#ifdef _M_IX86
#pragma intrinsic(_m_pextrw)
#pragma intrinsic(_m_pinsrw)
#pragma intrinsic(_m_pmaxsw)
#pragma intrinsic(_m_pmaxub)
#pragma intrinsic(_m_pminsw)
#pragma intrinsic(_m_pminub)
#pragma intrinsic(_m_pmovmskb)
#pragma intrinsic(_m_pmulhuw)
#pragma intrinsic(_m_pshufw)
#pragma intrinsic(_m_maskmovq)
#pragma intrinsic(_m_pavgb)
#pragma intrinsic(_m_pavgw)
#pragma intrinsic(_m_psadbw)
#pragma intrinsic(_mm_stream_pi)
#endif // _M_IX86
#pragma intrinsic(_mm_stream_ps)
#pragma intrinsic(_mm_sfence)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtss_si64)
#pragma intrinsic(_mm_cvttss_si64)
#pragma intrinsic(_mm_cvtsi64_ss)
#endif // _M_AMD64

#else /* _MSC_VER */

/*
  GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h
*/

/* Use inline functions on GCC/Clang */

#if !HAS_BUILTIN(_mm_getcsr)
__INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
{
    return __builtin_ia32_stmxcsr();
}
#endif

#if !HAS_BUILTIN(_mm_setcsr)
__INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
{
    __builtin_ia32_ldmxcsr(a);
}
#endif

__INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b)
{
    __a[0] += __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a + (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b)
{
    __a[0] -= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a - (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b)
{
    __a[0] *= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a * (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b)
{
    __a[0] /= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a / (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_sqrt_ss(__m128 __a)
{
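    /* sqrtss computes the square root of the low element only; the upper
       three elements of the result are copied through from __a. */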
    return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_sqrt_ps(__m128 __a)
{
    return __builtin_ia32_sqrtps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a)
{
    return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a)
{
    return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ss(__m128 __a)
{
    return __builtin_ia32_rsqrtss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ps(__m128 __a)
{
    return __builtin_ia32_rsqrtps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b)
{
    return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b)
{
    return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a & (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b)
{
    return (__m128)(~(__v4su)__a & (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a | (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a ^ (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b)
{
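    /* There is no native "greater or equal" compare; a >= b is computed as
       b <= a by swapping the operands of cmpleps. */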
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
}

// _mm_cvt_ss2si
__INTRIN_INLINE_SSE int _mm_cvtss_si32(__m128 __a)
{
    return __builtin_ia32_cvtss2si((__v4sf)__a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a)
{
    return __builtin_ia32_cvtss2si64((__v4sf)__a);
}
#endif

// _mm_cvt_ps2pi
__INTRIN_INLINE_SSE __m64 _mm_cvtps_pi32(__m128 __a)
{
    return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
}

// _mm_cvtt_ss2si
__INTRIN_INLINE_SSE int _mm_cvttss_si32(__m128 __a)
{
    return __builtin_ia32_cvttss2si((__v4sf)__a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a)
{
    return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
#endif

// _mm_cvtt_ps2pi
__INTRIN_INLINE_SSE __m64 _mm_cvttps_pi32(__m128 __a)
{
    return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
}

// _mm_cvt_si2ss
__INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b)
{
    __a[0] = __b;
    return __a;
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b)
{
    __a[0] = __b;
    return __a;
}
#endif

// _mm_cvt_pi2ps
__INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
    return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
}

__INTRIN_INLINE_SSE float _mm_cvtss_f32(__m128 __a)
{
    return __a[0];
}

__INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p)
{
#ifdef __clang__
    typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadh_pi_struct {
        __mm_loadh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
#else
    return (__m128)__builtin_ia32_loadhps(__a, __p);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p)
{
#ifdef __clang__
    typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadl_pi_struct {
        __mm_loadl_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
#else
    return (__m128)__builtin_ia32_loadlps(__a, __p);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p)
{
    return _mm_set_ss(*__p);
}
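/*
 * Overview of the load variants below: _mm_load_ss loads a single float into
 * the low element and zeroes the upper three; _mm_load1_ps (_mm_load_ps1)
 * broadcasts one float to all four elements; _mm_load_ps requires a 16-byte
 * aligned address, while _mm_loadu_ps accepts any alignment. A minimal sketch
 * with a hypothetical buffer:
 *
 *   float buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f }; // not necessarily aligned
 *   __m128 v = _mm_loadu_ps(buf);              // unaligned load of 4 floats
 */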
// _mm_load_ps1
__INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p)
{
    return _mm_set1_ps(*__p);
}

__INTRIN_INLINE_SSE __m128 _mm_load_ps(const float *__p)
{
    return *(const __m128*)__p;
}

__INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p)
{
    struct __loadu_ps {
        __m128_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_ps*)__p)->__v;
}

__INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p)
{
    __m128 __a = _mm_load_ps(__p);
#ifdef __clang__
    return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void)
{
#ifdef __clang__
    return (__m128)__builtin_ia32_undef128();
#else
    __m128 undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_set_ss(float __w)
{
    return __extension__ (__m128){ __w, 0, 0, 0 };
}

// _mm_set_ps1
__INTRIN_INLINE_SSE __m128 _mm_set1_ps(float __w)
{
    return __extension__ (__m128){ __w, __w, __w, __w };
}

__INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w)
{
    return __extension__ (__m128){ __w, __x, __y, __z };
}

__INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w)
{
    return __extension__ (__m128){ __z, __y, __x, __w };
}

__INTRIN_INLINE_SSE __m128 _mm_setzero_ps(void)
{
    return __extension__ (__m128){ 0, 0, 0, 0 };
}

__INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a)
{
#ifdef __clang__
    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
#else
    __builtin_ia32_storehps(__p, __a);
#endif
}

__INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a)
{
#ifdef __clang__
    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
#else
    __builtin_ia32_storelps(__p, __a);
#endif
}

__INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a)
{
    *__p = ((__v4sf)__a)[0];
}

__INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a)
{
    *(__m128_u *)__p = __a;
}

__INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a)
{
    *(__m128*)__p = __a;
}

// _mm_store_ps1
__INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a)
{
    // FIXME: Should we use a temp instead?
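    // Broadcast element 0 into all four lanes of __a, then perform a single
    // aligned 16-byte store below.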
#ifdef __clang__
    __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
#else
    __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));
#endif
    _mm_store_ps(__p, __a);
}

__INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
    _mm_store_ps(__p, __tmp);
}

/* GCC / Clang specific constants */
#define _MM_HINT_NTA_ALT 0
#define _MM_HINT_T0_ALT 3
#define _MM_HINT_T1_ALT 2
#define _MM_HINT_T2_ALT 1
#define _MM_HINT_ENTA_ALT 4

// These are not supported yet
//#define _MM_HINT_ET0_ALT 7
//#define _MM_HINT_ET1_ALT 6
//#define _MM_HINT_ET2_ALT 5

#define _MM_HINT_MS_TO_ALT(sel) \
    (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
     ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
     ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
     ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
     ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)

#ifdef _MSC_VER

/* On clang-cl we have an intrinsic, but the constants are different */
#pragma intrinsic(_mm_prefetch)
#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))

#else /* _MSC_VER */

#define _mm_prefetch(p, sel) \
    __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)

#endif /* _MSC_VER */

__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
{
#ifdef __clang__
    __builtin_ia32_movntq((__v1di*)__p, __a);
#else
    __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);
#endif
}

__INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
#else
    __builtin_ia32_movntps(__p, (__v4sf)__a);
#endif
}

#if !HAS_BUILTIN(_mm_sfence)
__INTRIN_INLINE_SSE void _mm_sfence(void)
{
    __builtin_ia32_sfence();
}
#endif

#ifdef __clang__
#define _m_pextrw(a, n) \
    ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))

#define _m_pinsrw(a, d, n) \
    ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
#else
// _m_pextrw
__INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n)
{
    return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
}

// _m_pinsrw
__INTRIN_INLINE_SSE __m64 _mm_insert_pi16(__m64 const __a, int const __d, int const __n)
{
    return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__a, __d, __n);
}

#endif

// _m_pmaxsw
__INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pmaxub
__INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

// _m_pminsw
__INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pminub
__INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b)
{
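    // pminub computes the per-byte unsigned minimum across all eight bytes.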
    return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}

// _m_pmovmskb
__INTRIN_INLINE_SSE int _mm_movemask_pi8(__m64 __a)
{
    return __builtin_ia32_pmovmskb((__v8qi)__a);
}

// _m_pmulhuw
__INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}

#ifdef __clang__
#define _m_pshufw(a, n) \
    ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
#else
// _m_pshufw
__INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16(__m64 __a, int const __n)
{
    return (__m64)__builtin_ia32_pshufw((__v4hi)__a, __n);
}
#endif

// _m_maskmovq
__INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
    __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}

// _m_pavgb
__INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

// _m_pavgw
__INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

// _m_psadbw
__INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}

#endif // __GNUC__

#ifdef __cplusplus
}
#endif // __cplusplus

#endif /* _INCLUDED_MM2 */