/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header helps port code that uses Intel intrinsics from x86_64 to
   powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in the MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make such implementations more efficient.

   It's the user's responsibility to determine whether the results of such a
   port are acceptable or further changes are needed. Please note that much
   code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions with 64-bit scalar operations,
   or with 128-bit SSE/Altivec operations, which is the recommended
   approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
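
/* Example (illustrative, not part of this header): the 32-bit conversions
   above zero-extend on the way in and truncate on the way out, so a
   negative int round-trips through the low half only. A minimal sketch,
   assuming the header is included with -DNO_WARN_X86_INTRINSICS:

     __m64 __v = _mm_cvtsi32_si64(-2);   // bits: 0x00000000FFFFFFFE
     int __r = _mm_cvtsi64_si32(__v);    // -2 again (low 32 bits)
*/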

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}
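
/* Example (illustrative, not part of this header): _mm_packs_pi16 clamps
   each signed 16-bit lane into [-128, 127] before narrowing, so
   out-of-range lanes saturate rather than wrap. A minimal sketch using the
   set/zero helpers defined later in this header:

     __m64 __a = _mm_set_pi16(300, -300, 127, -1);
     __m64 __p = _mm_packs_pi16(__a, _mm_setzero_si64());
     // Result bytes, least significant first:
     //   -1, 127, -128 (saturated from -300), 127 (saturated from 300),
     //   then 0, 0, 0, 0 from the second operand.
*/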

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r = vec_packs((__vector unsigned short)__vm1,
                  (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
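
/* Example (illustrative, not part of this header): the high-half unpack
   interleaves the upper four bytes of each operand, alternating M1/M2.
   A minimal sketch using the _mm_set_pi8 helper defined near the end of
   this header:

     __m64 __a = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);      // byte i holds i
     __m64 __b = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10);
     __m64 __u = _mm_unpackhi_pi8(__a, __b);
     // Result bytes, least significant first: 4, 14, 5, 15, 6, 16, 7, 17.
*/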

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}
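
/* Example (illustrative, not part of this header): the plain add/sub
   intrinsics are modular, so each lane wraps on overflow; the saturating
   forms appear further down. A minimal sketch using the _mm_set1_pi8
   helper defined near the end of this header:

     __m64 __a = _mm_set1_pi8(120);
     __m64 __s = _mm_add_pi8(__a, __a);  // each byte: 240 mod 256, i.e. -16
*/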

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}
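
/* Example (illustrative, not part of this header): combining the 64-bit
   shift and AND intrinsics above to extract a bit-field. Note that these
   shifts are plain C shifts, so a count of 64 or more is undefined
   behavior here (hardware MMX instead yields zero). A minimal sketch
   extracting bits 16..31:

     __m64 __field(__m64 __v) {
       return _mm_and_si64(_mm_srli_si64(__v, 16), (__m64)0xFFFF);
     }
*/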

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}
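
/* Example (illustrative, not part of this header): the comparisons return
   all-ones (0xFF) per matching byte, which composes with the logical
   intrinsics above as a select mask. A minimal sketch picking __a's byte
   where it equals __k's, else __b's:

     __m64 __sel(__m64 __a, __m64 __b, __m64 __k) {
       __m64 __m = _mm_cmpeq_pi8(__a, __k);
       return _mm_or_si64(_mm_and_si64(__m, __a),
                          _mm_andnot_si64(__m, __b));
     }
*/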

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}
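
/* Example (illustrative, not part of this header): the greater-than
   comparisons are signed, matching MMX, so a byte holding 0xFF compares
   as -1. A minimal sketch:

     __m64 __a = _mm_set1_pi8((signed char)0xFF);          // -1 per lane
     __m64 __g = _mm_cmpgt_pi8(_mm_setzero_si64(), __a);   // 0 > -1: 0xFF
*/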

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}
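
/* Example (illustrative, not part of this header): unlike _mm_add_pi8,
   the saturating forms clamp instead of wrapping. A minimal sketch:

     __m64 __a = _mm_set1_pi8(120);
     __m64 __s = _mm_adds_pi8(__a, __a);   // each byte: 127 (signed clamp)
     __m64 __u = _mm_adds_pu8(__a, __a);   // each byte: 240
     __m64 __t = _mm_adds_pu8(__u, __u);   // each byte: 255 (unsigned clamp)
*/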

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}
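
/* Example (illustrative, not part of this header): _mm_madd_pi16, defined
   just below, forms four 32-bit products and sums them in pairs, i.e. a
   two-element dot product per 32-bit lane. A minimal sketch:

     __m64 __a = _mm_set_pi16(4, 3, 2, 1);
     __m64 __b = _mm_set_pi16(40, 30, 20, 10);
     __m64 __d = _mm_madd_pi16(__a, __b);
     // Low 32 bits: 1*10 + 2*20 = 50; high 32 bits: 3*30 + 4*40 = 250.
*/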

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}
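
/* Example (illustrative, not part of this header): the per-lane shifts
   follow the MMX rule that a count larger than the lane width clears the
   result, as implemented by the explicit (__count <= 15) test above.
   A minimal sketch:

     __m64 __a = _mm_set1_pi16(0x0101);
     __m64 __x = _mm_sll_pi16(__a, 8);    // each lane: 0x0100
     __m64 __y = _mm_sll_pi16(__a, 16);   // count > 15, so all zeros
*/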

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long long then invoke _mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long long then invoke _mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long long then invoke _mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}
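
/* Example (illustrative, not part of this header): the _mm_sra_* forms
   above replicate the sign bit, while the _mm_srl_* forms below shift in
   zeros. A minimal sketch:

     __m64 __a = _mm_set1_pi16(-4);      // each lane: 0xFFFC
     __m64 __s = _mm_sra_pi16(__a, 1);   // each lane: -2 (sign-extended)
     __m64 __l = _mm_srl_pi16(__a, 1);   // each lane: 0x7FFE
*/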

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long long then invoke _mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long long then invoke _mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long long then invoke _mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}
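
/* Example (illustrative, not part of this header): as the comments above
   note, the last argument of the _mm_set_* helpers is the least
   significant lane, and the _mm_setr_* helpers defined just below take
   the arguments in reverse order. A minimal sketch:

     __m64 __a = _mm_set_pi32(2, 1);    // lane 0 (the I0 argument) is 1
     __m64 __b = _mm_setr_pi32(1, 2);   // same value, arguments reversed
*/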

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short __v;

  __v = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)__v)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
        */

#endif /* _MMINTRIN_H_INCLUDED */