/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#pragma once
#ifndef _INCLUDED_EMM
#define _INCLUDED_EMM

#include <vcruntime.h>
#include <xmmintrin.h>

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i
{
    __int8 m128i_i8[16];
    __int16 m128i_i16[8];
    __int32 m128i_i32[4];
    __int64 m128i_i64[2];
    unsigned __int8 m128i_u8[16];
    unsigned __int16 m128i_u16[8];
    unsigned __int32 m128i_u32[4];
    unsigned __int64 m128i_u64[2];
} __m128i;
_STATIC_ASSERT(sizeof(__m128i) == 16);

typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d
{
    double m128d_f64[2];
} __m128d;

typedef __declspec(align(1)) __m128i __m128i_u;
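
/* Usage sketch (illustrative): with the MSVC union definition above, single
 * lanes can be read directly through the named members, e.g.
 *     __m128i v = _mm_set1_epi32(7);
 *     int lane0 = v.m128i_i32[0];   // 7
 * The Clang/GCC path below defines __m128i with vector extensions instead,
 * which do not provide these named members. */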

#define __ATTRIBUTE_SSE2__

#else /* _MSC_VER */

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __clang__
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128)))
#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2")))
#endif
#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__
#define __INTRIN_INLINE_MMXSSE2 __INTRIN_INLINE __ATTRIBUTE_MMXSSE2__

#endif /* _MSC_VER */

#ifdef __cplusplus
extern "C" {
#endif

extern __m128d _mm_add_sd(__m128d a, __m128d b);
extern __m128d _mm_add_pd(__m128d a, __m128d b);
extern __m128d _mm_sub_sd(__m128d a, __m128d b);
extern __m128d _mm_sub_pd(__m128d a, __m128d b);
extern __m128d _mm_mul_sd(__m128d a, __m128d b);
extern __m128d _mm_mul_pd(__m128d a, __m128d b);
extern __m128d _mm_div_sd(__m128d a, __m128d b);
extern __m128d _mm_div_pd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_sd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_pd(__m128d a);
extern __m128d _mm_min_sd(__m128d a, __m128d b);
extern __m128d _mm_min_pd(__m128d a, __m128d b);
extern __m128d _mm_max_sd(__m128d a, __m128d b);
extern __m128d _mm_max_pd(__m128d a, __m128d b);
extern __m128d _mm_and_pd(__m128d a, __m128d b);
extern __m128d _mm_andnot_pd(__m128d a, __m128d b);
extern __m128d _mm_or_pd(__m128d a, __m128d b);
extern __m128d _mm_xor_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmple_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmple_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b);
extern int _mm_comieq_sd(__m128d a, __m128d b);
extern int _mm_comilt_sd(__m128d a, __m128d b);
extern int _mm_comile_sd(__m128d a, __m128d b);
extern int _mm_comigt_sd(__m128d a, __m128d b);
extern int _mm_comige_sd(__m128d a, __m128d b);
extern int _mm_comineq_sd(__m128d a, __m128d b);
extern int _mm_ucomieq_sd(__m128d a, __m128d b);
extern int _mm_ucomilt_sd(__m128d a, __m128d b);
extern int _mm_ucomile_sd(__m128d a, __m128d b);
extern int _mm_ucomigt_sd(__m128d a, __m128d b);
extern int _mm_ucomige_sd(__m128d a, __m128d b);
extern int _mm_ucomineq_sd(__m128d a, __m128d b);
extern __m128 _mm_cvtpd_ps(__m128d a);
extern __m128d _mm_cvtps_pd(__m128 a);
extern __m128d _mm_cvtepi32_pd(__m128i a);
extern __m128i _mm_cvtpd_epi32(__m128d a);
extern int _mm_cvtsd_si32(__m128d a);
extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b);
extern __m128d _mm_cvtsi32_sd(__m128d a, int b);
extern __m128d _mm_cvtss_sd(__m128d a, __m128 b);
extern __m128i _mm_cvttpd_epi32(__m128d a);
extern int _mm_cvttsd_si32(__m128d a);
extern __m64 _mm_cvtpd_pi32(__m128d a);
extern __m64 _mm_cvttpd_pi32(__m128d a);
extern __m128d _mm_cvtpi32_pd(__m64 a);
extern double _mm_cvtsd_f64(__m128d a);
extern __m128d _mm_load_pd(double const *dp);
extern __m128d _mm_load1_pd(double const *dp);
extern __m128d _mm_loadr_pd(double const *dp);
extern __m128d _mm_loadu_pd(double const *dp);
//extern __m128i _mm_loadu_si64(void const *a);
//extern __m128i _mm_loadu_si32(void const *a);
//extern __m128i _mm_loadu_si16(void const *a);
extern __m128d _mm_load_sd(double const *dp);
extern __m128d _mm_loadh_pd(__m128d a, double const *dp);
extern __m128d _mm_loadl_pd(__m128d a, double const *dp);
//extern __m128d _mm_undefined_pd(void);
extern __m128d _mm_set_sd(double w);
extern __m128d _mm_set1_pd(double w);
extern __m128d _mm_set_pd(double w, double x);
extern __m128d _mm_setr_pd(double w, double x);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d a, __m128d b);
extern void _mm_store_sd(double *dp, __m128d a);
extern void _mm_store_pd(double *dp, __m128d a);
extern void _mm_store1_pd(double *dp, __m128d a);
extern void _mm_storeu_pd(double *dp, __m128d a);
extern void _mm_storer_pd(double *dp, __m128d a);
extern void _mm_storeh_pd(double *dp, __m128d a);
extern void _mm_storel_pd(double *dp, __m128d a);
extern __m128i _mm_add_epi8(__m128i a, __m128i b);
extern __m128i _mm_add_epi16(__m128i a, __m128i b);
extern __m128i _mm_add_epi32(__m128i a, __m128i b);
extern __m64 _mm_add_si64(__m64 a, __m64 b);
extern __m128i _mm_add_epi64(__m128i a, __m128i b);
extern __m128i _mm_adds_epi8(__m128i a, __m128i b);
extern __m128i _mm_adds_epi16(__m128i a, __m128i b);
extern __m128i _mm_adds_epu8(__m128i a, __m128i b);
extern __m128i _mm_adds_epu16(__m128i a, __m128i b);
extern __m128i _mm_avg_epu8(__m128i a, __m128i b);
extern __m128i _mm_avg_epu16(__m128i a, __m128i b);
extern __m128i _mm_madd_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epu8(__m128i a, __m128i b);
extern __m128i _mm_min_epi16(__m128i a, __m128i b);
extern __m128i _mm_min_epu8(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b);
extern __m128i _mm_mullo_epi16(__m128i a, __m128i b);
extern __m64 _mm_mul_su32(__m64 a, __m64 b);
extern __m128i _mm_mul_epu32(__m128i a, __m128i b);
extern __m128i _mm_sad_epu8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi16(__m128i a, __m128i b);
extern __m128i _mm_sub_epi32(__m128i a, __m128i b);
extern __m64 _mm_sub_si64(__m64 a, __m64 b);
extern __m128i _mm_sub_epi64(__m128i a, __m128i b);
extern __m128i _mm_subs_epi8(__m128i a, __m128i b);
extern __m128i _mm_subs_epi16(__m128i a, __m128i b);
extern __m128i _mm_subs_epu8(__m128i a, __m128i b);
extern __m128i _mm_subs_epu16(__m128i a, __m128i b);
extern __m128i _mm_and_si128(__m128i a, __m128i b);
extern __m128i _mm_andnot_si128(__m128i a, __m128i b);
extern __m128i _mm_or_si128(__m128i a, __m128i b);
extern __m128i _mm_xor_si128(__m128i a, __m128i b);
extern __m128i _mm_slli_si128(__m128i a, int i);
extern __m128i _mm_slli_epi16(__m128i a, int count);
extern __m128i _mm_sll_epi16(__m128i a, __m128i count);
extern __m128i _mm_slli_epi32(__m128i a, int count);
extern __m128i _mm_sll_epi32(__m128i a, __m128i count);
extern __m128i _mm_slli_epi64(__m128i a, int count);
extern __m128i _mm_sll_epi64(__m128i a, __m128i count);
extern __m128i _mm_srai_epi16(__m128i a, int count);
extern __m128i _mm_sra_epi16(__m128i a, __m128i count);
extern __m128i _mm_srai_epi32(__m128i a, int count);
extern __m128i _mm_sra_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_si128(__m128i a, int imm);
extern __m128i _mm_srli_epi16(__m128i a, int count);
extern __m128i _mm_srl_epi16(__m128i a, __m128i count);
extern __m128i _mm_srli_epi32(__m128i a, int count);
extern __m128i _mm_srl_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_epi64(__m128i a, int count);
extern __m128i _mm_srl_epi64(__m128i a, __m128i count);
extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b);
#ifdef _M_AMD64
extern __m128d _mm_cvtsi64_sd(__m128d a, long long b);
extern long long _mm_cvtsd_si64(__m128d a);
extern long long _mm_cvttsd_si64(__m128d a);
#endif
extern __m128 _mm_cvtepi32_ps(__m128i a);
extern __m128i _mm_cvtps_epi32(__m128 a);
extern __m128i _mm_cvttps_epi32(__m128 a);
extern __m128i _mm_cvtsi32_si128(int a);
#ifdef _M_AMD64
extern __m128i _mm_cvtsi64_si128(long long a);
#endif
extern int _mm_cvtsi128_si32(__m128i a);
#ifdef _M_AMD64
extern long long _mm_cvtsi128_si64(__m128i a);
#endif
extern __m128i _mm_load_si128(__m128i const *p);
extern __m128i _mm_loadu_si128(__m128i_u const *p);
extern __m128i _mm_loadl_epi64(__m128i_u const *p);
//extern __m128i _mm_undefined_si128(void);
//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME
extern __m128i _mm_set_epi64(__m64 q1, __m64 q0);
//extern __m128i _mm_set_epi32(int i3, int i1, int i0);
extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
//extern __m128i _mm_set_epi16(short w7, short w2, short w1, short w0);
extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
//extern __m128i _mm_set_epi8(char b15, char b10, char b4, char b3, char b2, char b1, char b0);
extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
//extern __m128i _mm_set1_epi64x(long long q); // FIXME
extern __m128i _mm_set1_epi64(__m64 q);
extern __m128i _mm_set1_epi32(int i);
extern __m128i _mm_set1_epi16(short w);
extern __m128i _mm_set1_epi8(char b);
extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang?
extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1);
//extern __m128i _mm_setr_epi32(int i0, int i2, int i3);
extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
//extern __m128i _mm_setr_epi16(short w0, short w5, short w6, short w7);
extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
//extern __m128i _mm_setr_epi8(char b0, char b6, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setzero_si128(void);
extern void _mm_store_si128(__m128i *p, __m128i b);
extern void _mm_storeu_si128(__m128i_u *p, __m128i b);
//extern void _mm_storeu_si64(void *p, __m128i b);
//extern void _mm_storeu_si32(void *p, __m128i b);
//extern void _mm_storeu_si16(void *p, __m128i b);
extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p);
extern void _mm_storel_epi64(__m128i_u *p, __m128i a);
extern void _mm_stream_pd(double *p, __m128d a);
extern void _mm_stream_si128(__m128i *p, __m128i a);
extern void _mm_stream_si32(int *p, int a);
extern void _mm_clflush(void const *p);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern __m128i _mm_packs_epi16(__m128i a, __m128i b);
extern __m128i _mm_packs_epi32(__m128i a, __m128i b);
extern __m128i _mm_packus_epi16(__m128i a, __m128i b);
extern int _mm_extract_epi16(__m128i a, int imm);
extern __m128i _mm_insert_epi16(__m128i a, int b, int imm);
extern int _mm_movemask_epi8(__m128i a);
extern __m128i _mm_shuffle_epi32(__m128i a, int imm);
extern __m128i _mm_shufflelo_epi16(__m128i a, int imm);
extern __m128i _mm_shufflehi_epi16(__m128i a, int imm);
extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
extern __m64 _mm_movepi64_pi64(__m128i a);
extern __m128i _mm_movpi64_epi64(__m64 a);
extern __m128i _mm_move_epi64(__m128i a);
extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b);
extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b);
extern int _mm_movemask_pd(__m128d a);
extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm);
extern __m128 _mm_castpd_ps(__m128d a);
extern __m128i _mm_castpd_si128(__m128d a);
extern __m128d _mm_castps_pd(__m128 a);
extern __m128i _mm_castps_si128(__m128 a);
extern __m128 _mm_castsi128_ps(__m128i a);
extern __m128d _mm_castsi128_pd(__m128i a);
void _mm_pause(void);

/* Alternate names */
#define _mm_set_pd1(a) _mm_set1_pd(a)
#define _mm_load_pd1(p) _mm_load1_pd(p)
#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a))
#define _mm_bslli_si128 _mm_slli_si128
#define _mm_bsrli_si128 _mm_srli_si128
#define _mm_stream_si64 _mm_stream_si64x

#if defined(_MSC_VER) && !defined(__clang__)

#pragma intrinsic(_mm_add_sd)
#pragma intrinsic(_mm_add_pd)
#pragma intrinsic(_mm_sub_sd)
#pragma intrinsic(_mm_sub_pd)
#pragma intrinsic(_mm_mul_sd)
#pragma intrinsic(_mm_mul_pd)
#pragma intrinsic(_mm_div_sd)
#pragma intrinsic(_mm_div_pd)
#pragma intrinsic(_mm_sqrt_sd)
#pragma intrinsic(_mm_sqrt_pd)
#pragma intrinsic(_mm_min_sd)
#pragma intrinsic(_mm_min_pd)
#pragma intrinsic(_mm_max_sd)
#pragma intrinsic(_mm_max_pd)
#pragma intrinsic(_mm_and_pd)
#pragma intrinsic(_mm_andnot_pd)
#pragma intrinsic(_mm_or_pd)
#pragma intrinsic(_mm_xor_pd)
#pragma intrinsic(_mm_cmpeq_pd)
#pragma intrinsic(_mm_cmplt_pd)
#pragma intrinsic(_mm_cmple_pd)
#pragma intrinsic(_mm_cmpgt_pd)
#pragma intrinsic(_mm_cmpge_pd)
#pragma intrinsic(_mm_cmpord_pd)
#pragma intrinsic(_mm_cmpunord_pd)
#pragma intrinsic(_mm_cmpneq_pd)
#pragma intrinsic(_mm_cmpnlt_pd)
#pragma intrinsic(_mm_cmpnle_pd)
#pragma intrinsic(_mm_cmpngt_pd)
#pragma intrinsic(_mm_cmpnge_pd)
#pragma intrinsic(_mm_cmpeq_sd)
#pragma intrinsic(_mm_cmplt_sd)
#pragma intrinsic(_mm_cmple_sd)
#pragma intrinsic(_mm_cmpgt_sd)
#pragma intrinsic(_mm_cmpge_sd)
#pragma intrinsic(_mm_cmpord_sd)
#pragma intrinsic(_mm_cmpunord_sd)
#pragma intrinsic(_mm_cmpneq_sd)
#pragma intrinsic(_mm_cmpnlt_sd)
#pragma intrinsic(_mm_cmpnle_sd)
#pragma intrinsic(_mm_cmpngt_sd)
#pragma intrinsic(_mm_cmpnge_sd)
#pragma intrinsic(_mm_comieq_sd)
#pragma intrinsic(_mm_comilt_sd)
#pragma intrinsic(_mm_comile_sd)
#pragma intrinsic(_mm_comigt_sd)
#pragma intrinsic(_mm_comige_sd)
#pragma intrinsic(_mm_comineq_sd)
#pragma intrinsic(_mm_ucomieq_sd)
#pragma intrinsic(_mm_ucomilt_sd)
#pragma intrinsic(_mm_ucomile_sd)
#pragma intrinsic(_mm_ucomigt_sd)
#pragma intrinsic(_mm_ucomige_sd)
#pragma intrinsic(_mm_ucomineq_sd)
#pragma intrinsic(_mm_cvtpd_ps)
#pragma intrinsic(_mm_cvtps_pd)
#pragma intrinsic(_mm_cvtepi32_pd)
#pragma intrinsic(_mm_cvtpd_epi32)
#pragma intrinsic(_mm_cvtsd_si32)
#pragma intrinsic(_mm_cvtsd_ss)
#pragma intrinsic(_mm_cvtsi32_sd)
#pragma intrinsic(_mm_cvtss_sd)
#pragma intrinsic(_mm_cvttpd_epi32)
#pragma intrinsic(_mm_cvttsd_si32)
//#pragma intrinsic(_mm_cvtpd_pi32)
//#pragma intrinsic(_mm_cvttpd_pi32)
//#pragma intrinsic(_mm_cvtpi32_pd)
#pragma intrinsic(_mm_cvtsd_f64)
#pragma intrinsic(_mm_load_pd)
#pragma intrinsic(_mm_load1_pd)
#pragma intrinsic(_mm_loadr_pd)
#pragma intrinsic(_mm_loadu_pd)
//#pragma intrinsic(_mm_loadu_si64)
//#pragma intrinsic(_mm_loadu_si32)
//#pragma intrinsic(_mm_loadu_si16)
#pragma intrinsic(_mm_load_sd)
#pragma intrinsic(_mm_loadh_pd)
#pragma intrinsic(_mm_loadl_pd)
//#pragma intrinsic(_mm_undefined_pd)
#pragma intrinsic(_mm_set_sd)
#pragma intrinsic(_mm_set1_pd)
#pragma intrinsic(_mm_set_pd)
#pragma intrinsic(_mm_setr_pd)
#pragma intrinsic(_mm_setzero_pd)
#pragma intrinsic(_mm_move_sd)
#pragma intrinsic(_mm_store_sd)
#pragma intrinsic(_mm_store_pd)
#pragma intrinsic(_mm_store1_pd)
#pragma intrinsic(_mm_storeu_pd)
#pragma intrinsic(_mm_storer_pd)
#pragma intrinsic(_mm_storeh_pd)
#pragma intrinsic(_mm_storel_pd)
#pragma intrinsic(_mm_add_epi8)
#pragma intrinsic(_mm_add_epi16)
#pragma intrinsic(_mm_add_epi32)
//#pragma intrinsic(_mm_add_si64)
#pragma intrinsic(_mm_add_epi64)
#pragma intrinsic(_mm_adds_epi8)
#pragma intrinsic(_mm_adds_epi16)
#pragma intrinsic(_mm_adds_epu8)
#pragma intrinsic(_mm_adds_epu16)
#pragma intrinsic(_mm_avg_epu8)
#pragma intrinsic(_mm_avg_epu16)
#pragma intrinsic(_mm_madd_epi16)
#pragma intrinsic(_mm_max_epi16)
#pragma intrinsic(_mm_max_epu8)
#pragma intrinsic(_mm_min_epi16)
#pragma intrinsic(_mm_min_epu8)
#pragma intrinsic(_mm_mulhi_epi16)
#pragma intrinsic(_mm_mulhi_epu16)
#pragma intrinsic(_mm_mullo_epi16)
//#pragma intrinsic(_mm_mul_su32)
#pragma intrinsic(_mm_mul_epu32)
#pragma intrinsic(_mm_sad_epu8)
#pragma intrinsic(_mm_sub_epi8)
#pragma intrinsic(_mm_sub_epi16)
#pragma intrinsic(_mm_sub_epi32)
//#pragma intrinsic(_mm_sub_si64)
#pragma intrinsic(_mm_sub_epi64)
#pragma intrinsic(_mm_subs_epi8)
#pragma intrinsic(_mm_subs_epi16)
#pragma intrinsic(_mm_subs_epu8)
#pragma intrinsic(_mm_subs_epu16)
#pragma intrinsic(_mm_and_si128)
#pragma intrinsic(_mm_andnot_si128)
#pragma intrinsic(_mm_or_si128)
#pragma intrinsic(_mm_xor_si128)
#pragma intrinsic(_mm_slli_si128)
#pragma intrinsic(_mm_slli_epi16)
#pragma intrinsic(_mm_sll_epi16)
#pragma intrinsic(_mm_slli_epi32)
#pragma intrinsic(_mm_sll_epi32)
#pragma intrinsic(_mm_slli_epi64)
#pragma intrinsic(_mm_sll_epi64)
#pragma intrinsic(_mm_srai_epi16)
#pragma intrinsic(_mm_sra_epi16)
#pragma intrinsic(_mm_srai_epi32)
#pragma intrinsic(_mm_sra_epi32)
#pragma intrinsic(_mm_srli_si128)
#pragma intrinsic(_mm_srli_epi16)
#pragma intrinsic(_mm_srl_epi16)
#pragma intrinsic(_mm_srli_epi32)
#pragma intrinsic(_mm_srl_epi32)
#pragma intrinsic(_mm_srli_epi64)
#pragma intrinsic(_mm_srl_epi64)
#pragma intrinsic(_mm_cmpeq_epi8)
#pragma intrinsic(_mm_cmpeq_epi16)
#pragma intrinsic(_mm_cmpeq_epi32)
#pragma intrinsic(_mm_cmpgt_epi8)
#pragma intrinsic(_mm_cmpgt_epi16)
#pragma intrinsic(_mm_cmpgt_epi32)
#pragma intrinsic(_mm_cmplt_epi8)
#pragma intrinsic(_mm_cmplt_epi16)
#pragma intrinsic(_mm_cmplt_epi32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_sd)
#pragma intrinsic(_mm_cvtsd_si64)
#pragma intrinsic(_mm_cvttsd_si64)
#endif
#pragma intrinsic(_mm_cvtepi32_ps)
#pragma intrinsic(_mm_cvtps_epi32)
#pragma intrinsic(_mm_cvttps_epi32)
#pragma intrinsic(_mm_cvtsi32_si128)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_si128)
#endif
#pragma intrinsic(_mm_cvtsi128_si32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi128_si64)
#endif
#pragma intrinsic(_mm_load_si128)
#pragma intrinsic(_mm_loadu_si128)
#pragma intrinsic(_mm_loadl_epi64)
//#pragma intrinsic(_mm_undefined_si128)
//#pragma intrinsic(_mm_set_epi64x)
//#pragma intrinsic(_mm_set_epi64)
#pragma intrinsic(_mm_set_epi32)
#pragma intrinsic(_mm_set_epi16)
#pragma intrinsic(_mm_set_epi8)
//#pragma intrinsic(_mm_set1_epi64x)
//#pragma intrinsic(_mm_set1_epi64)
#pragma intrinsic(_mm_set1_epi32)
#pragma intrinsic(_mm_set1_epi16)
#pragma intrinsic(_mm_set1_epi8)
#pragma intrinsic(_mm_setl_epi64)
//#pragma intrinsic(_mm_setr_epi64)
#pragma intrinsic(_mm_setr_epi32)
#pragma intrinsic(_mm_setr_epi16)
#pragma intrinsic(_mm_setr_epi8)
#pragma intrinsic(_mm_setzero_si128)
#pragma intrinsic(_mm_store_si128)
#pragma intrinsic(_mm_storeu_si128)
//#pragma intrinsic(_mm_storeu_si64)
//#pragma intrinsic(_mm_storeu_si32)
//#pragma intrinsic(_mm_storeu_si16)
#pragma intrinsic(_mm_maskmoveu_si128)
#pragma intrinsic(_mm_storel_epi64)
#pragma intrinsic(_mm_stream_pd)
#pragma intrinsic(_mm_stream_si128)
#pragma intrinsic(_mm_stream_si32)
#pragma intrinsic(_mm_clflush)
#pragma intrinsic(_mm_lfence)
#pragma intrinsic(_mm_mfence)
#pragma intrinsic(_mm_packs_epi16)
#pragma intrinsic(_mm_packs_epi32)
#pragma intrinsic(_mm_packus_epi16)
#pragma intrinsic(_mm_extract_epi16)
#pragma intrinsic(_mm_insert_epi16)
#pragma intrinsic(_mm_movemask_epi8)
#pragma intrinsic(_mm_shuffle_epi32)
#pragma intrinsic(_mm_shufflelo_epi16)
#pragma intrinsic(_mm_shufflehi_epi16)
#pragma intrinsic(_mm_unpackhi_epi8)
#pragma intrinsic(_mm_unpackhi_epi16)
#pragma intrinsic(_mm_unpackhi_epi32)
#pragma intrinsic(_mm_unpackhi_epi64)
#pragma intrinsic(_mm_unpacklo_epi8)
#pragma intrinsic(_mm_unpacklo_epi16)
#pragma intrinsic(_mm_unpacklo_epi32)
#pragma intrinsic(_mm_unpacklo_epi64)
//#pragma intrinsic(_mm_movepi64_pi64)
//#pragma intrinsic(_mm_movpi64_epi64)
#pragma intrinsic(_mm_move_epi64)
#pragma intrinsic(_mm_unpackhi_pd)
#pragma intrinsic(_mm_unpacklo_pd)
#pragma intrinsic(_mm_movemask_pd)
#pragma intrinsic(_mm_shuffle_pd)
#pragma intrinsic(_mm_castpd_ps)
#pragma intrinsic(_mm_castpd_si128)
#pragma intrinsic(_mm_castps_pd)
#pragma intrinsic(_mm_castps_si128)
#pragma intrinsic(_mm_castsi128_ps)
#pragma intrinsic(_mm_castsi128_pd)
#pragma intrinsic(_mm_pause)

#else /* _MSC_VER */

/*
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h
  Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h
  unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h
*/

__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b)
{
    a[0] += b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a + (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    a[0] -= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a - (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b)
{
    a[0] *= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a * (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b)
{
    a[0] /= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a / (__v2df)b);
}
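
/* Note: the _sd ("scalar double") forms operate on the low lane only and
 * copy the high lane from the first operand, while the _pd forms operate on
 * both lanes. Illustrative example, with a = {1.0, 2.0}, b = {10.0, 20.0}:
 *     _mm_add_sd(a, b)  -> {11.0,  2.0}
 *     _mm_add_pd(a, b)  -> {11.0, 22.0}
 */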

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_sqrtsd((__v2df)b);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a)
{
    return __builtin_ia32_sqrtpd((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_minsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_minpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    return (__m128d)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a ^ (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a);
}
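
/* The packed comparisons above return an all-ones lane for true and an
 * all-zeros lane for false, so the result can be used directly as a bit
 * mask. A common branchless per-lane select, as a sketch:
 *     __m128d mask = _mm_cmplt_pd(x, y);
 *     __m128d sel = _mm_or_pd(_mm_and_pd(mask, x), _mm_andnot_pd(mask, y));
 *     // per lane: sel = (x < y) ? x : y
 */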

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
}
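
/* Both families above compare the low lanes and return 0 or 1. The comi*
 * forms map to COMISD and the ucomi* forms to UCOMISD; they differ only in
 * that COMISD additionally signals an invalid-operation exception for QNaN
 * operands. */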

__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
{
    return __builtin_ia32_cvtpd2ps((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtps2pd(a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtdq2pd((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
{
    return __builtin_ia32_cvtsd2si((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
    return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a, int b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
{
    return __builtin_ia32_cvttsd2si((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvtpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvttpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m128d _mm_cvtpi32_pd(__m64 a)
{
    return __builtin_ia32_cvtpi2pd((__v2si)a);
}

__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
{
    return *(const __m128d *)dp;
}

__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
{
    struct __mm_load1_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, __u};
}

// GCC:
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
    (((fp1) << 1) | (fp0))
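
/* Illustrative use of the selector with _mm_shuffle_pd (declared above):
 * bit 0 chooses the lane taken from the first operand, bit 1 the lane taken
 * from the second, e.g.
 *     __m128d r = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(0, 1));
 *     // r = { a[1], b[0] }
 */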

__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    __m128d u = *(const __m128d *)dp;
    return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
#else
    return (__m128d){ dp[1], dp[0] };
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
{
    struct __loadu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_pd *)dp)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
{
    struct __loadu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    long long __u = ((const struct __loadu_si64 *)a)->__v;
    return __extension__(__m128i)(__v2di){__u, 0LL};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
{
    struct __loadu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    int __u = ((const struct __loadu_si32 *)a)->__v;
    return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
{
    struct __loadu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    short __u = ((const struct __loadu_si16 *)a)->__v;
    return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
{
    struct __mm_load_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
    return __extension__(__m128d){__u, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
{
    struct __mm_loadh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
    return __extension__(__m128d){a[0], __u};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
{
    struct __mm_loadl_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128d)__builtin_ia32_undef128();
#else
    __m128d undef = undef; /* self-initialization suppresses "used uninitialized" warnings */
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
{
    return __extension__(__m128d){w, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
{
    return __extension__(__m128d){w, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
{
    return __extension__(__m128d){x, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
{
    return __extension__(__m128d){w, x};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
{
    return __extension__(__m128d){0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
{
    struct __mm_store_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_store_sd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
{
    *(__m128d *)dp = a;
}

__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
    _mm_store_pd(dp, a);
#else
    dp[0] = a[0];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
{
    struct __storeu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_pd *)dp)->__v = a;
}

__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
    *(__m128d *)dp = a;
#else
    dp[0] = a[1];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
}

__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a + (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a + (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a + (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a + (__v2du)b);
}
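
/* The _mm_adds_* family below saturates instead of wrapping, e.g. for
 * signed bytes 127 + 1 stays 127 and (-128) + (-1) stays -128, and for
 * unsigned bytes 255 + 1 stays 255. */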

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}
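
/* _mm_madd_epi16 above (PMADDWD) multiplies eight pairs of signed 16-bit
 * lanes and adds adjacent products into four signed 32-bit sums; one output
 * lane as a sketch:
 *     r32[0] = (int)a16[0] * b16[0] + (int)a16[1] * b16[1];
 * This is the usual building block for integer dot products. */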

__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a * (__v8hu)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a - (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a - (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a - (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a - (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return (__m128i)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a ^ (__v2du)b);
}

#ifdef __clang__
#define _mm_slli_si128(a, imm) \
    ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#else
__INTRIN_INLINE_SSE2 __m128i _mm_slli_si128(__m128i a, const int imm)
{
    return (__m128i)__builtin_ia32_pslldqi128(a, imm * 8);
}
#endif
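
/* Note: _mm_slli_si128 above (and _mm_srli_si128 further below) shift the
 * full 128-bit value by a byte count, not a bit count, e.g.
 *     _mm_slli_si128(v, 4)   // shift left by 4 bytes = 32 bits
 * which is why the non-Clang path multiplies the immediate by 8. */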

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psllqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}
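
/* The _mm_sra* forms above shift arithmetically (the sign bit is
 * replicated), while the _mm_srl* forms below shift in zeros; e.g. for a
 * 32-bit lane holding -4:
 *     _mm_srai_epi32(v, 1)   // lane becomes -2
 *     _mm_srli_epi32(v, 1)   // lane becomes 0x7FFFFFFE */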

#ifdef __clang__
#define _mm_srli_si128(a, imm) \
    ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#else
__INTRIN_INLINE_SSE2 __m128i _mm_srli_si128(__m128i a, const int imm)
{
    return (__m128i)__builtin_ia32_psrldqi128(a, imm * 8);
}
#endif

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psrlqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qi)a == (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a == (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a == (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    /* This function always performs a signed comparison, but __v16qi is a char
       which may be signed or unsigned, so use __v16qs. */
    return (__m128i)((__v16qs)a > (__v16qs)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a > (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a > (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi8(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi16(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi32(b, a);
}

#ifdef _M_AMD64

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
{
    return __builtin_ia32_cvtsd2si64((__v2df)a);
}

__INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
{
    return __builtin_ia32_cvttsd2si64((__v2df)a);
}
#endif

__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
#else
    return __builtin_ia32_cvtdq2ps((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
{
    return __extension__(__m128i)(__v4si){a, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
{
    return __extension__(__m128i)(__v2di){a, 0};
}

__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
{
    __v4si b = (__v4si)a;
    return b[0];
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
{
    return *p;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
{
    struct __loadu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_si128 *)p)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
{
    struct __mm_loadl_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    return __extension__(__m128i){
        ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128i)__builtin_ia32_undef128();
#else
    __m128i undef = undef; /* self-initialization suppresses "used uninitialized" warnings */
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
{
    return __extension__(__m128i)(__v2di){q0, q1};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
{
    return _mm_set_epi64x((long long)q1, (long long)q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
    short w7, short w6, short w5, short w4,
    short w3, short w2, short w1, short w0)
{
    return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
    char b15, char b14, char b13, char b12,
    char b11, char b10, char b9, char b8,
    char b7, char b6, char b5, char b4,
    char b3, char b2, char b1, char b0)
{
    return __extension__(__m128i)(__v16qi){
        b0, b1, b2, b3, b4, b5, b6, b7,
        b8, b9, b10, b11, b12, b13, b14, b15};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
{
    return _mm_set_epi64x(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
{
    return _mm_set_epi64(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
{
    return _mm_set_epi16(w, w, w, w, w, w, w, w);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
{
    return _mm_set_epi8(b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
{
    return _mm_set_epi64(q1, q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
    return _mm_set_epi32(i3, i2, i1, i0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
    short w0, short w1, short w2, short w3,
    short w4, short w5, short w6, short w7)
{
    return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
    char b0, char b1, char b2, char b3,
    char b4, char b5, char b6, char b7,
    char b8, char b9, char b10, char b11,
    char b12, char b13, char b14, char b15)
{
    return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
                        b7, b6, b5, b4, b3, b2, b1, b0);
}
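
/* Argument-order note: the _mm_set_* functions above take arguments from
 * the highest lane down to the lowest, while the _mm_setr_* ("reversed")
 * forms take them from the lowest up; both of these produce {0, 1, 2, 3}:
 *     _mm_set_epi32(3, 2, 1, 0);
 *     _mm_setr_epi32(0, 1, 2, 3); */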

__INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
{
    return __extension__(__m128i)(__v2di){0LL, 0LL};
}

__INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
{
    *p = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
{
    struct __storeu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si128 *)p)->__v = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
{
    struct __storeu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
{
    struct __storeu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
{
    struct __storeu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
    __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di*)p);
#else
    __builtin_ia32_movntdq(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
/* Note: _mm_stream_si64 is #defined to _mm_stream_si64x above, so this
   actually defines _mm_stream_si64x. */
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif

void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

#define _mm_extract_epi16(a, imm) \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm) \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                          (int)(imm)))
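
/* These are macros because the lane index is expected to be a constant
 * expression; an illustrative round trip:
 *     int w3 = _mm_extract_epi16(v, 3);         // word 3, zero-extended
 *     __m128i v2 = _mm_insert_epi16(v, 42, 3);  // replace word 3 with 42 */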
__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, (__v2df)a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di *)p);
#else
    __builtin_ia32_movntdq((__v2di *)p, (__v2di)a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif

void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

#define _mm_extract_epi16(a, imm) \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm) \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                          (int)(imm)))

__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
{
    return __builtin_ia32_pmovmskb128((__v16qi)a);
}

#define _mm_shuffle_epi32(a, imm) \
    ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

#define _mm_shufflelo_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

#define _mm_shufflehi_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
        16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
#else
    return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
                                            8 + 5, 6, 8 + 6, 7, 8 + 7);
#else
    return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
                                            4 + 3);
#else
    return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
#endif
}
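/* The unpackhi/unpacklo family interleaves elements from the high (or low)
 * halves of the two sources, taking alternately from a and b; the shuffle
 * indices above encode exactly that (16 + n, 8 + n, ... select element n of
 * b). An illustrative sketch, not part of this header:
 *
 *   __m128i hi = _mm_unpackhi_epi32(_mm_setr_epi32(0, 1, 2, 3),
 *                                   _mm_setr_epi32(4, 5, 6, 7));
 *   // hi holds {2, 6, 3, 7}; the unpacklo variants below interleave the
 *   // low halves the same way, giving {0, 4, 1, 5}
 */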
__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
#else
    return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
        16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
#else
    return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
                                            8 + 1, 2, 8 + 2, 3, 8 + 3);
#else
    return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
                                            4 + 1);
#else
    return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
#else
    return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
{
    return (__m64)a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
{
    return __extension__(__m128i)(__v2di){(long long)a, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
#else
    return (__m128i)__builtin_ia32_movq128((__v2di)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
#else
    return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
#else
    return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
{
    return __builtin_ia32_movmskpd((__v2df)a);
}

#define _mm_shuffle_pd(a, b, i) \
    ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                    (int)(i)))

__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
{
    return (__m128d)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
{
    return (__m128d)a;
}
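/* The _mm_cast* intrinsics above reinterpret the 128-bit pattern without
 * generating any instructions; use the _mm_cvt* family for value
 * conversions. An illustrative sketch, not part of this header:
 *
 *   __m128i bits = _mm_castps_si128(_mm_set1_ps(1.0f));
 *   // each 32-bit lane of bits is 0x3F800000, the IEEE-754 encoding of 1.0f
 */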
void _mm_pause(void);

#endif /* _MSC_VER */

#ifdef __cplusplus
} // extern "C"
#endif

#endif /* _INCLUDED_EMM */