/* Copyright (C) 2003-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _EMMINTRIN_H_INCLUDED
#define _EMMINTRIN_H_INCLUDED

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* If SSE2 is not enabled for this compilation, turn it on just for the
   intrinsics defined in this header; popped again at the end of the file.  */
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

/* SSE2 */
/* Internal 128-bit vector types, one per element width/signedness.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Create a selector for use with the SHUFPD instruction.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

/* Alternate name for _mm_set1_pd.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  /* Self-initialization leaves the value indeterminate on purpose while
     suppressing "used uninitialized" diagnostics.  */
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return __extension__ (__m128d){ 0.0, 0.0 };
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  return *(__m128d *)__P;
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  /* The __aligned__ (1) type makes the compiler emit an unaligned load.  */
  return *(__m128d_u *)__P;
}

/* Create a vector with all two elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return _mm_set1_pd (*__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

/* Alternate name for _mm_load1_pd.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __m128d __tmp = _mm_load_pd (__P);
  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

/* Return the lower DPFP value as a scalar double.  */
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

/* Alternate name for _mm_store_sd.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}

/* Alternate name for _mm_store1_pd.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}

/* Extract the 32-bit integer in element 0 of A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}

#ifdef __x86_64__
/* Intel intrinsic.  Extract the 64-bit integer in element 0 of A.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}
#endif

/* Add the two DPFP values of A and B element-wise.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower DPFP values of A and B; the upper result comes from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

/* Subtract B from A element-wise.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

/* Subtract the lower DPFP value of B from A; upper result comes from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

/* Multiply A and B element-wise.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

/* Multiply the lower DPFP values; upper result comes from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

/* Divide A by B element-wise.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

/* Divide the lower DPFP values; upper result comes from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}

/* Square root of both DPFP values.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}

/* Element-wise minimum of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

/* Minimum of the lower DPFP values; upper result comes from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

/* Element-wise maximum of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

/* Maximum of the lower DPFP values; upper result comes from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise AND of the 128-bit values in A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise (~A) & B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise OR of the 128-bit values in A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise XOR of the 128-bit values in A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}

/* Element-wise comparisons.  Each result element is all 1s where the
   predicate holds and all 0s where it does not.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

396 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 397 _mm_cmpnle_pd (__m128d __A, __m128d __B) 398 { 399 return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); 400 } 401 402 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 403 _mm_cmpngt_pd (__m128d __A, __m128d __B) 404 { 405 return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); 406 } 407 408 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 409 _mm_cmpnge_pd (__m128d __A, __m128d __B) 410 { 411 return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); 412 } 413 414 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 415 _mm_cmpord_pd (__m128d __A, __m128d __B) 416 { 417 return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); 418 } 419 420 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 421 _mm_cmpunord_pd (__m128d __A, __m128d __B) 422 { 423 return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); 424 } 425 426 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 427 _mm_cmpeq_sd (__m128d __A, __m128d __B) 428 { 429 return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); 430 } 431 432 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 433 _mm_cmplt_sd (__m128d __A, __m128d __B) 434 { 435 return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); 436 } 437 438 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 439 _mm_cmple_sd (__m128d __A, __m128d __B) 440 { 441 return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); 442 } 443 444 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 445 _mm_cmpgt_sd (__m128d __A, __m128d __B) 446 { 447 return (__m128d) 
__builtin_ia32_movsd ((__v2df) __A, 448 (__v2df) 449 __builtin_ia32_cmpltsd ((__v2df) __B, 450 (__v2df) 451 __A)); 452 } 453 454 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 455 _mm_cmpge_sd (__m128d __A, __m128d __B) 456 { 457 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 458 (__v2df) 459 __builtin_ia32_cmplesd ((__v2df) __B, 460 (__v2df) 461 __A)); 462 } 463 464 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 465 _mm_cmpneq_sd (__m128d __A, __m128d __B) 466 { 467 return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); 468 } 469 470 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 471 _mm_cmpnlt_sd (__m128d __A, __m128d __B) 472 { 473 return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); 474 } 475 476 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 477 _mm_cmpnle_sd (__m128d __A, __m128d __B) 478 { 479 return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); 480 } 481 482 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 483 _mm_cmpngt_sd (__m128d __A, __m128d __B) 484 { 485 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 486 (__v2df) 487 __builtin_ia32_cmpnltsd ((__v2df) __B, 488 (__v2df) 489 __A)); 490 } 491 492 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 493 _mm_cmpnge_sd (__m128d __A, __m128d __B) 494 { 495 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 496 (__v2df) 497 __builtin_ia32_cmpnlesd ((__v2df) __B, 498 (__v2df) 499 __A)); 500 } 501 502 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 503 _mm_cmpord_sd (__m128d __A, __m128d __B) 504 { 505 return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); 506 } 507 508 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) 509 _mm_cmpunord_sd (__m128d __A, __m128d __B) 510 { 511 return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); 512 } 513 514 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 515 _mm_comieq_sd (__m128d __A, __m128d __B) 516 { 517 return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); 518 } 519 520 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 521 _mm_comilt_sd (__m128d __A, __m128d __B) 522 { 523 return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); 524 } 525 526 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 527 _mm_comile_sd (__m128d __A, __m128d __B) 528 { 529 return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); 530 } 531 532 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 533 _mm_comigt_sd (__m128d __A, __m128d __B) 534 { 535 return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); 536 } 537 538 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 539 _mm_comige_sd (__m128d __A, __m128d __B) 540 { 541 return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); 542 } 543 544 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 545 _mm_comineq_sd (__m128d __A, __m128d __B) 546 { 547 return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); 548 } 549 550 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 551 _mm_ucomieq_sd (__m128d __A, __m128d __B) 552 { 553 return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); 554 } 555 556 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 557 _mm_ucomilt_sd (__m128d __A, __m128d __B) 558 { 559 return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); 560 } 561 562 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 563 _mm_ucomile_sd (__m128d 
__A, __m128d __B) 564 { 565 return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); 566 } 567 568 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 569 _mm_ucomigt_sd (__m128d __A, __m128d __B) 570 { 571 return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); 572 } 573 574 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 575 _mm_ucomige_sd (__m128d __A, __m128d __B) 576 { 577 return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); 578 } 579 580 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 581 _mm_ucomineq_sd (__m128d __A, __m128d __B) 582 { 583 return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); 584 } 585 586 /* Create a vector of Qi, where i is the element number. */ 587 588 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 589 _mm_set_epi64x (long long __q1, long long __q0) 590 { 591 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 592 } 593 594 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 595 _mm_set_epi64 (__m64 __q1, __m64 __q0) 596 { 597 return _mm_set_epi64x ((long long)__q1, (long long)__q0); 598 } 599 600 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 601 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) 602 { 603 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; 604 } 605 606 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 607 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, 608 short __q3, short __q2, short __q1, short __q0) 609 { 610 return __extension__ (__m128i)(__v8hi){ 611 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; 612 } 613 614 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 615 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, 616 char 
__q11, char __q10, char __q09, char __q08, 617 char __q07, char __q06, char __q05, char __q04, 618 char __q03, char __q02, char __q01, char __q00) 619 { 620 return __extension__ (__m128i)(__v16qi){ 621 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 622 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 623 }; 624 } 625 626 /* Set all of the elements of the vector to A. */ 627 628 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 629 _mm_set1_epi64x (long long __A) 630 { 631 return _mm_set_epi64x (__A, __A); 632 } 633 634 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 635 _mm_set1_epi64 (__m64 __A) 636 { 637 return _mm_set_epi64 (__A, __A); 638 } 639 640 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 641 _mm_set1_epi32 (int __A) 642 { 643 return _mm_set_epi32 (__A, __A, __A, __A); 644 } 645 646 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 647 _mm_set1_epi16 (short __A) 648 { 649 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); 650 } 651 652 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 653 _mm_set1_epi8 (char __A) 654 { 655 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, 656 __A, __A, __A, __A, __A, __A, __A, __A); 657 } 658 659 /* Create a vector of Qi, where i is the element number. 660 The parameter order is reversed from the _mm_set_epi* functions. 
*/ 661 662 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 663 _mm_setr_epi64 (__m64 __q0, __m64 __q1) 664 { 665 return _mm_set_epi64 (__q1, __q0); 666 } 667 668 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 669 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) 670 { 671 return _mm_set_epi32 (__q3, __q2, __q1, __q0); 672 } 673 674 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 675 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, 676 short __q4, short __q5, short __q6, short __q7) 677 { 678 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 679 } 680 681 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 682 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, 683 char __q04, char __q05, char __q06, char __q07, 684 char __q08, char __q09, char __q10, char __q11, 685 char __q12, char __q13, char __q14, char __q15) 686 { 687 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 688 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 689 } 690 691 /* Create a vector with element 0 as *P and the rest zero. 
*/ 692 693 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 694 _mm_load_si128 (__m128i const *__P) 695 { 696 return *__P; 697 } 698 699 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 700 _mm_loadu_si128 (__m128i_u const *__P) 701 { 702 return *__P; 703 } 704 705 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 706 _mm_loadl_epi64 (__m128i_u const *__P) 707 { 708 return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P); 709 } 710 711 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 712 _mm_store_si128 (__m128i *__P, __m128i __B) 713 { 714 *__P = __B; 715 } 716 717 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 718 _mm_storeu_si128 (__m128i_u *__P, __m128i __B) 719 { 720 *__P = __B; 721 } 722 723 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 724 _mm_storel_epi64 (__m128i_u *__P, __m128i __B) 725 { 726 *(__m64_u *)__P = (__m64) ((__v2di)__B)[0]; 727 } 728 729 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 730 _mm_movepi64_pi64 (__m128i __B) 731 { 732 return (__m64) ((__v2di)__B)[0]; 733 } 734 735 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 736 _mm_movpi64_epi64 (__m64 __A) 737 { 738 return _mm_set_epi64 ((__m64)0LL, __A); 739 } 740 741 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 742 _mm_move_epi64 (__m128i __A) 743 { 744 return (__m128i)__builtin_ia32_movq128 ((__v2di) __A); 745 } 746 747 /* Create an undefined vector. */ 748 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 749 _mm_undefined_si128 (void) 750 { 751 __m128i __Y = __Y; 752 return __Y; 753 } 754 755 /* Create a vector of zeros. 
*/ 756 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 757 _mm_setzero_si128 (void) 758 { 759 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; 760 } 761 762 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 763 _mm_cvtepi32_pd (__m128i __A) 764 { 765 return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); 766 } 767 768 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 769 _mm_cvtepi32_ps (__m128i __A) 770 { 771 return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); 772 } 773 774 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 775 _mm_cvtpd_epi32 (__m128d __A) 776 { 777 return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); 778 } 779 780 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 781 _mm_cvtpd_pi32 (__m128d __A) 782 { 783 return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); 784 } 785 786 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 787 _mm_cvtpd_ps (__m128d __A) 788 { 789 return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); 790 } 791 792 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 793 _mm_cvttpd_epi32 (__m128d __A) 794 { 795 return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); 796 } 797 798 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 799 _mm_cvttpd_pi32 (__m128d __A) 800 { 801 return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); 802 } 803 804 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 805 _mm_cvtpi32_pd (__m64 __A) 806 { 807 return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); 808 } 809 810 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 811 _mm_cvtps_epi32 (__m128 __A) 812 { 813 return 
(__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); 814 } 815 816 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 817 _mm_cvttps_epi32 (__m128 __A) 818 { 819 return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A); 820 } 821 822 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 823 _mm_cvtps_pd (__m128 __A) 824 { 825 return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); 826 } 827 828 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 829 _mm_cvtsd_si32 (__m128d __A) 830 { 831 return __builtin_ia32_cvtsd2si ((__v2df) __A); 832 } 833 834 #ifdef __x86_64__ 835 /* Intel intrinsic. */ 836 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 837 _mm_cvtsd_si64 (__m128d __A) 838 { 839 return __builtin_ia32_cvtsd2si64 ((__v2df) __A); 840 } 841 842 /* Microsoft intrinsic. */ 843 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 844 _mm_cvtsd_si64x (__m128d __A) 845 { 846 return __builtin_ia32_cvtsd2si64 ((__v2df) __A); 847 } 848 #endif 849 850 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 851 _mm_cvttsd_si32 (__m128d __A) 852 { 853 return __builtin_ia32_cvttsd2si ((__v2df) __A); 854 } 855 856 #ifdef __x86_64__ 857 /* Intel intrinsic. */ 858 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 859 _mm_cvttsd_si64 (__m128d __A) 860 { 861 return __builtin_ia32_cvttsd2si64 ((__v2df) __A); 862 } 863 864 /* Microsoft intrinsic. 
*/ 865 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 866 _mm_cvttsd_si64x (__m128d __A) 867 { 868 return __builtin_ia32_cvttsd2si64 ((__v2df) __A); 869 } 870 #endif 871 872 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 873 _mm_cvtsd_ss (__m128 __A, __m128d __B) 874 { 875 return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); 876 } 877 878 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 879 _mm_cvtsi32_sd (__m128d __A, int __B) 880 { 881 return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); 882 } 883 884 #ifdef __x86_64__ 885 /* Intel intrinsic. */ 886 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 887 _mm_cvtsi64_sd (__m128d __A, long long __B) 888 { 889 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); 890 } 891 892 /* Microsoft intrinsic. */ 893 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 894 _mm_cvtsi64x_sd (__m128d __A, long long __B) 895 { 896 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); 897 } 898 #endif 899 900 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 901 _mm_cvtss_sd (__m128d __A, __m128 __B) 902 { 903 return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); 904 } 905 906 #ifdef __OPTIMIZE__ 907 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 908 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) 909 { 910 return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask); 911 } 912 #else 913 #define _mm_shuffle_pd(A, B, N) \ 914 ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A), \ 915 (__v2df)(__m128d)(B), (int)(N))) 916 #endif 917 918 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 919 _mm_unpackhi_pd (__m128d __A, 
__m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

/* Interleave the lower DPFP elements of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

/* Load a DPFP value from B into the upper element of A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}

/* Load a DPFP value from B into the lower element of A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}

/* Gather the sign bits of the two DPFP elements of A into the low
   two bits of the result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}

/* Pack the 16-bit elements of A and B into 8-bit elements with signed
   saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

/* Pack the 32-bit elements of A and B into 16-bit elements with signed
   saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

/* Pack the 16-bit elements of A and B into 8-bit elements with unsigned
   saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}

/* Interleave the upper halves of A and B, per element width.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
}

/* Interleave the lower halves of A and B, per element width.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}

/* Add packed integers.  Unsigned vector types are used so that
   wraparound is well defined (signed overflow would be UB in C).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A +
(__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

/* Add packed integers with saturation (signed/unsigned variants).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Subtract packed integers (modular, via unsigned vector types).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

/* Subtract packed integers with saturation (signed/unsigned variants).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Multiply signed 16-bit pairs and horizontally add adjacent products
   into 32-bit elements (PMADDWD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* High 16 bits of the signed 16x16->32 products.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Low 16 bits of the 16x16 products (same bits for signed/unsigned).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}

/* Multiply the low unsigned 32-bit elements of the MMX operands A and B,
   producing a full 64-bit product.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

/* Multiply the even unsigned 32-bit elements of A and B, producing
   two 64-bit products.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}

/* Shift each element left by the immediate count B, per element width.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}

/* Arithmetic (sign-propagating) right shift by the immediate count B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}

/* Whole-register byte shifts.  N is a byte count; the builtin takes a
   bit count, hence the * 8.  _mm_srli_si128/_mm_slli_si128 are the
   classic names, _mm_bsrli_si128/_mm_bslli_si128 the newer aliases.  */
#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}
#else
#define _mm_bsrli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_bslli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_srli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_slli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#endif

/* Logical (zero-filling) right shift by the immediate count B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}

/* Shift each element left by the count held in the low quadword of B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__A,
(__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}

/* Arithmetic right shift by the count held in the low quadword of B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}

/* Logical right shift by the count held in the low quadword of B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise logic on the full 128-bit register.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A & (__v2du)__B);
}

/* Result is (~A) & B, matching the PANDN instruction.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A | (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
}

/* Element-wise comparisons.  Vector comparisons yield all-ones (-1) in
   each element where the predicate holds, zero elsewhere.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A == (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A == (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A < (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A < (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A < (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A > (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A
> (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A > (__v4si)__B);
}

/* Extract/insert 16-bit element N.  The extract zero-extends the
   element into an int.  Macro forms are used when N cannot be proved
   constant without optimization.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_insert_epi16(A, D, N)				\
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A),	\
					  (int)(D), (int)(N)))
#endif

/* Element-wise maximum/minimum (signed 16-bit, unsigned 8-bit).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}

/* Gather the sign bits of the 16 bytes of A into the low 16 bits of
   the result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

/* High 16 bits of the unsigned 16x16->32 products.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Shuffle 16-bit elements of the high/low half, or 32-bit elements of
   the whole register, per the immediate MASK.  Macro forms are used
   when MASK cannot be proved constant without optimization.  */
#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
}
#else
#define _mm_shufflehi_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shufflelo_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shuffle_epi32(A, N) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
#endif

/* Conditionally store bytes of A to C, selected by the sign bits of B
   (MASKMOVDQU, non-temporal).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

/* Rounded averages of unsigned elements.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A,
(__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Sum of absolute byte differences, accumulated into the two 64-bit
   halves of the result (PSADBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}

/* Non-temporal (cache-bypassing) stores.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

#ifdef __x86_64__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  __builtin_ia32_movnti64 (__A, __B);
}
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}

/* Flush the cache line containing A from every cache level.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Serialize load operations.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Serialize load and store operations.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}

/* Move the 32-bit integer A into the low element; zero the rest.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  Move the 64-bit integer A into the low quadword;
   zero the upper quadword.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Microsoft intrinsic.  Same operation under its alternate name.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */

#endif /* _EMMINTRIN_H_INCLUDED */