/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in the data format and placement of double scalars in the
   vector register, which require extra steps to match SSE2 scalar
   double semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers.  It is
   recommended to use portable <fenv.h> instead of accessing MXCSR
   directly.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define a two-value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.
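   For example (a usage sketch, not part of the original header, assuming
   VSX support):

     __m128d __v = _mm_set_pd(3.0, 1.0);  // element [0] = 1.0, element [1] = 3.0
     double __lo = _mm_cvtsd_f64(__v);    // __lo == 1.0

   _mm_setr_pd(1.0, 3.0) builds the same vector with the arguments given
   in element (memory) order.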
*/ 92 extern __inline __m128d 93 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 94 _mm_set_pd(double __W, double __X) { 95 return __extension__(__m128d){__X, __W}; 96 } 97 98 /* Create a vector with the lower value W and upper value X. */ 99 extern __inline __m128d 100 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 101 _mm_setr_pd(double __W, double __X) { 102 return __extension__(__m128d){__W, __X}; 103 } 104 105 /* Create an undefined vector. */ 106 extern __inline __m128d 107 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 108 _mm_undefined_pd(void) { 109 __m128d __Y = __Y; 110 return __Y; 111 } 112 113 /* Create a vector of zeros. */ 114 extern __inline __m128d 115 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 116 _mm_setzero_pd(void) { 117 return (__m128d)vec_splats(0); 118 } 119 120 /* Sets the low DPFP value of A from the low value of B. */ 121 extern __inline __m128d 122 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 123 _mm_move_sd(__m128d __A, __m128d __B) { 124 __v2df __result = (__v2df)__A; 125 __result[0] = ((__v2df)__B)[0]; 126 return (__m128d)__result; 127 } 128 129 /* Load two DPFP values from P. The address must be 16-byte aligned. */ 130 extern __inline __m128d 131 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 132 _mm_load_pd(double const *__P) { 133 return ((__m128d)vec_ld(0, (__v16qu *)__P)); 134 } 135 136 /* Load two DPFP values from P. The address need not be 16-byte aligned. */ 137 extern __inline __m128d 138 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 139 _mm_loadu_pd(double const *__P) { 140 return (vec_vsx_ld(0, __P)); 141 } 142 143 /* Create a vector with all two elements equal to *P. */ 144 extern __inline __m128d 145 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 146 _mm_load1_pd(double const *__P) { 147 return (vec_splats(*__P)); 148 } 149 150 /* Create a vector with element 0 as *P and the rest zero. */ 151 extern __inline __m128d 152 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 153 _mm_load_sd(double const *__P) { 154 return _mm_set_sd(*__P); 155 } 156 157 extern __inline __m128d 158 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 159 _mm_load_pd1(double const *__P) { 160 return _mm_load1_pd(__P); 161 } 162 163 /* Load two DPFP values in reverse order. The address must be aligned. */ 164 extern __inline __m128d 165 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 166 _mm_loadr_pd(double const *__P) { 167 __v2df __tmp = _mm_load_pd(__P); 168 return (__m128d)vec_xxpermdi(__tmp, __tmp, 2); 169 } 170 171 /* Store two DPFP values. The address must be 16-byte aligned. */ 172 extern __inline void 173 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 174 _mm_store_pd(double *__P, __m128d __A) { 175 vec_st((__v16qu)__A, 0, (__v16qu *)__P); 176 } 177 178 /* Store two DPFP values. The address need not be 16-byte aligned. */ 179 extern __inline void 180 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 181 _mm_storeu_pd(double *__P, __m128d __A) { 182 *(__m128d_u *)__P = __A; 183 } 184 185 /* Stores the lower DPFP value. 
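   For example (a usage sketch, not part of the original header):

     double __buf[2] = {0.0, 0.0};
     __m128d __v = _mm_set_pd(2.0, 1.0);  // {1.0, 2.0}
     _mm_store_sd(__buf, __v);            // __buf[0] == 1.0, __buf[1] unchanged

   Only 8 bytes are written, and unlike _mm_store_pd no 16-byte alignment
   is required.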
*/ 186 extern __inline void 187 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 188 _mm_store_sd(double *__P, __m128d __A) { 189 *__P = ((__v2df)__A)[0]; 190 } 191 192 extern __inline double 193 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 194 _mm_cvtsd_f64(__m128d __A) { 195 return ((__v2df)__A)[0]; 196 } 197 198 extern __inline void 199 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 200 _mm_storel_pd(double *__P, __m128d __A) { 201 _mm_store_sd(__P, __A); 202 } 203 204 /* Stores the upper DPFP value. */ 205 extern __inline void 206 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 207 _mm_storeh_pd(double *__P, __m128d __A) { 208 *__P = ((__v2df)__A)[1]; 209 } 210 /* Store the lower DPFP value across two words. 211 The address must be 16-byte aligned. */ 212 extern __inline void 213 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 214 _mm_store1_pd(double *__P, __m128d __A) { 215 _mm_store_pd(__P, vec_splat(__A, 0)); 216 } 217 218 extern __inline void 219 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 220 _mm_store_pd1(double *__P, __m128d __A) { 221 _mm_store1_pd(__P, __A); 222 } 223 224 /* Store two DPFP values in reverse order. The address must be aligned. */ 225 extern __inline void 226 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 227 _mm_storer_pd(double *__P, __m128d __A) { 228 _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2)); 229 } 230 231 /* Intel intrinsic. */ 232 extern __inline long long 233 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 234 _mm_cvtsi128_si64(__m128i __A) { 235 return ((__v2di)__A)[0]; 236 } 237 238 /* Microsoft intrinsic. */ 239 extern __inline long long 240 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 241 _mm_cvtsi128_si64x(__m128i __A) { 242 return ((__v2di)__A)[0]; 243 } 244 245 extern __inline __m128d 246 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 247 _mm_add_pd(__m128d __A, __m128d __B) { 248 return (__m128d)((__v2df)__A + (__v2df)__B); 249 } 250 251 /* Add the lower double-precision (64-bit) floating-point element in 252 a and b, store the result in the lower element of dst, and copy 253 the upper element from a to the upper element of dst. 
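   For example (a worked sketch with made-up values):

     __m128d __a = _mm_set_pd(10.0, 1.0); // {1.0, 10.0}
     __m128d __b = _mm_set_pd(20.0, 2.0); // {2.0, 20.0}
     __m128d __r = _mm_add_sd(__a, __b);  // {3.0, 10.0}: only element [0] is added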
*/ 254 extern __inline __m128d 255 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 256 _mm_add_sd(__m128d __A, __m128d __B) { 257 __A[0] = __A[0] + __B[0]; 258 return (__A); 259 } 260 261 extern __inline __m128d 262 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 263 _mm_sub_pd(__m128d __A, __m128d __B) { 264 return (__m128d)((__v2df)__A - (__v2df)__B); 265 } 266 267 extern __inline __m128d 268 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 269 _mm_sub_sd(__m128d __A, __m128d __B) { 270 __A[0] = __A[0] - __B[0]; 271 return (__A); 272 } 273 274 extern __inline __m128d 275 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 276 _mm_mul_pd(__m128d __A, __m128d __B) { 277 return (__m128d)((__v2df)__A * (__v2df)__B); 278 } 279 280 extern __inline __m128d 281 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 282 _mm_mul_sd(__m128d __A, __m128d __B) { 283 __A[0] = __A[0] * __B[0]; 284 return (__A); 285 } 286 287 extern __inline __m128d 288 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 289 _mm_div_pd(__m128d __A, __m128d __B) { 290 return (__m128d)((__v2df)__A / (__v2df)__B); 291 } 292 293 extern __inline __m128d 294 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 295 _mm_div_sd(__m128d __A, __m128d __B) { 296 __A[0] = __A[0] / __B[0]; 297 return (__A); 298 } 299 300 extern __inline __m128d 301 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 302 _mm_sqrt_pd(__m128d __A) { 303 return (vec_sqrt(__A)); 304 } 305 306 /* Return pair {sqrt (B[0]), A[1]}. */ 307 extern __inline __m128d 308 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 309 _mm_sqrt_sd(__m128d __A, __m128d __B) { 310 __v2df __c; 311 __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0])); 312 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 313 } 314 315 extern __inline __m128d 316 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 317 _mm_min_pd(__m128d __A, __m128d __B) { 318 return (vec_min(__A, __B)); 319 } 320 321 extern __inline __m128d 322 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 323 _mm_min_sd(__m128d __A, __m128d __B) { 324 __v2df __a, __b, __c; 325 __a = vec_splats(__A[0]); 326 __b = vec_splats(__B[0]); 327 __c = vec_min(__a, __b); 328 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 329 } 330 331 extern __inline __m128d 332 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 333 _mm_max_pd(__m128d __A, __m128d __B) { 334 return (vec_max(__A, __B)); 335 } 336 337 extern __inline __m128d 338 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 339 _mm_max_sd(__m128d __A, __m128d __B) { 340 __v2df __a, __b, __c; 341 __a = vec_splats(__A[0]); 342 __b = vec_splats(__B[0]); 343 __c = vec_max(__a, __b); 344 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 345 } 346 347 extern __inline __m128d 348 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 349 _mm_cmpeq_pd(__m128d __A, __m128d __B) { 350 return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B)); 351 } 352 353 extern __inline __m128d 354 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 355 _mm_cmplt_pd(__m128d __A, __m128d __B) { 356 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B)); 357 } 358 359 extern __inline __m128d 360 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 361 _mm_cmple_pd(__m128d __A, __m128d __B) { 362 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B)); 363 } 364 365 extern __inline __m128d 366 
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) marks a NAN.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
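  /* Illustration (hypothetical values, not part of the original header):
     with __A = {1.0, 7.0} and __B = {1.0, 9.0} the splatted compare sets
     __c[0] to an all-ones 64-bit mask, so the result is {all-ones, 7.0};
     for unequal lower elements __c[0] would be all zeros.  */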
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than.
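     This identity holds for ordered operands; for example, __A[0] = 1.0
     and __B[0] = 2.0 yields an all-ones mask in element [0].  As with
     the other negated scalar compares above, a NaN operand produces a
     zero mask here, whereas the corresponding x86 predicate treats
     unordered operands as true.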
*/ 550 __c = (__v2df)vec_cmplt(__a, __b); 551 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 552 } 553 554 extern __inline __m128d 555 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 556 _mm_cmpord_sd(__m128d __A, __m128d __B) { 557 __v2df __r; 558 __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0])); 559 return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]); 560 } 561 562 extern __inline __m128d 563 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 564 _mm_cmpunord_sd(__m128d __A, __m128d __B) { 565 __v2df __r; 566 __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0])); 567 return (__m128d)_mm_setr_pd(__r[0], __A[1]); 568 } 569 570 /* FIXME 571 The __mm_comi??_sd and __mm_ucomi??_sd implementations below are 572 exactly the same because GCC for PowerPC only generates unordered 573 compares (scalar and vector). 574 Technically __mm_comieq_sp et all should be using the ordered 575 compare and signal for QNaNs. The __mm_ucomieq_sd et all should 576 be OK. */ 577 extern __inline int 578 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 579 _mm_comieq_sd(__m128d __A, __m128d __B) { 580 return (__A[0] == __B[0]); 581 } 582 583 extern __inline int 584 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 585 _mm_comilt_sd(__m128d __A, __m128d __B) { 586 return (__A[0] < __B[0]); 587 } 588 589 extern __inline int 590 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 591 _mm_comile_sd(__m128d __A, __m128d __B) { 592 return (__A[0] <= __B[0]); 593 } 594 595 extern __inline int 596 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 597 _mm_comigt_sd(__m128d __A, __m128d __B) { 598 return (__A[0] > __B[0]); 599 } 600 601 extern __inline int 602 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 603 _mm_comige_sd(__m128d __A, __m128d __B) { 604 return (__A[0] >= __B[0]); 605 } 606 607 extern __inline int 608 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 609 _mm_comineq_sd(__m128d __A, __m128d __B) { 610 return (__A[0] != __B[0]); 611 } 612 613 extern __inline int 614 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 615 _mm_ucomieq_sd(__m128d __A, __m128d __B) { 616 return (__A[0] == __B[0]); 617 } 618 619 extern __inline int 620 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 621 _mm_ucomilt_sd(__m128d __A, __m128d __B) { 622 return (__A[0] < __B[0]); 623 } 624 625 extern __inline int 626 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 627 _mm_ucomile_sd(__m128d __A, __m128d __B) { 628 return (__A[0] <= __B[0]); 629 } 630 631 extern __inline int 632 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 633 _mm_ucomigt_sd(__m128d __A, __m128d __B) { 634 return (__A[0] > __B[0]); 635 } 636 637 extern __inline int 638 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 639 _mm_ucomige_sd(__m128d __A, __m128d __B) { 640 return (__A[0] >= __B[0]); 641 } 642 643 extern __inline int 644 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 645 _mm_ucomineq_sd(__m128d __A, __m128d __B) { 646 return (__A[0] != __B[0]); 647 } 648 649 /* Create a vector of Qi, where i is the element number. 
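   For example (a usage sketch, not part of the original header):

     __m128i __v = _mm_set_epi32(3, 2, 1, 0); // element [i] == i
     int __e0 = _mm_cvtsi128_si32(__v);       // __e0 == 0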
*/ 650 extern __inline __m128i 651 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 652 _mm_set_epi64x(long long __q1, long long __q0) { 653 return __extension__(__m128i)(__v2di){__q0, __q1}; 654 } 655 656 extern __inline __m128i 657 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 658 _mm_set_epi64(__m64 __q1, __m64 __q0) { 659 return _mm_set_epi64x((long long)__q1, (long long)__q0); 660 } 661 662 extern __inline __m128i 663 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 664 _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) { 665 return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3}; 666 } 667 668 extern __inline __m128i 669 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 670 _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3, 671 short __q2, short __q1, short __q0) { 672 return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3, 673 __q4, __q5, __q6, __q7}; 674 } 675 676 extern __inline __m128i 677 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 678 _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11, 679 char __q10, char __q09, char __q08, char __q07, char __q06, 680 char __q05, char __q04, char __q03, char __q02, char __q01, 681 char __q00) { 682 return __extension__(__m128i)(__v16qi){ 683 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 684 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; 685 } 686 687 /* Set all of the elements of the vector to A. */ 688 extern __inline __m128i 689 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 690 _mm_set1_epi64x(long long __A) { 691 return _mm_set_epi64x(__A, __A); 692 } 693 694 extern __inline __m128i 695 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 696 _mm_set1_epi64(__m64 __A) { 697 return _mm_set_epi64(__A, __A); 698 } 699 700 extern __inline __m128i 701 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 702 _mm_set1_epi32(int __A) { 703 return _mm_set_epi32(__A, __A, __A, __A); 704 } 705 706 extern __inline __m128i 707 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 708 _mm_set1_epi16(short __A) { 709 return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A); 710 } 711 712 extern __inline __m128i 713 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 714 _mm_set1_epi8(char __A) { 715 return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, 716 __A, __A, __A, __A, __A); 717 } 718 719 /* Create a vector of Qi, where i is the element number. 720 The parameter order is reversed from the _mm_set_epi* functions. 
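   For example (a usage sketch, not part of the original header),
   _mm_setr_epi32(0, 1, 2, 3) and _mm_set_epi32(3, 2, 1, 0) build the
   same vector, with element [i] == i in both cases.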
*/ 721 extern __inline __m128i 722 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 723 _mm_setr_epi64(__m64 __q0, __m64 __q1) { 724 return _mm_set_epi64(__q1, __q0); 725 } 726 727 extern __inline __m128i 728 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 729 _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) { 730 return _mm_set_epi32(__q3, __q2, __q1, __q0); 731 } 732 733 extern __inline __m128i 734 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 735 _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4, 736 short __q5, short __q6, short __q7) { 737 return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 738 } 739 740 extern __inline __m128i 741 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 742 _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04, 743 char __q05, char __q06, char __q07, char __q08, char __q09, 744 char __q10, char __q11, char __q12, char __q13, char __q14, 745 char __q15) { 746 return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 747 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 748 } 749 750 /* Create a vector with element 0 as *P and the rest zero. */ 751 extern __inline __m128i 752 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 753 _mm_load_si128(__m128i const *__P) { 754 return *__P; 755 } 756 757 extern __inline __m128i 758 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 759 _mm_loadu_si128(__m128i_u const *__P) { 760 return (__m128i)(vec_vsx_ld(0, (signed int const *)__P)); 761 } 762 763 extern __inline __m128i 764 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 765 _mm_loadl_epi64(__m128i_u const *__P) { 766 return _mm_set_epi64((__m64)0LL, *(__m64 *)__P); 767 } 768 769 extern __inline void 770 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 771 _mm_store_si128(__m128i *__P, __m128i __B) { 772 vec_st((__v16qu)__B, 0, (__v16qu *)__P); 773 } 774 775 extern __inline void 776 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 777 _mm_storeu_si128(__m128i_u *__P, __m128i __B) { 778 *__P = __B; 779 } 780 781 extern __inline void 782 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 783 _mm_storel_epi64(__m128i_u *__P, __m128i __B) { 784 *(long long *)__P = ((__v2di)__B)[0]; 785 } 786 787 extern __inline __m64 788 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 789 _mm_movepi64_pi64(__m128i_u __B) { 790 return (__m64)((__v2di)__B)[0]; 791 } 792 793 extern __inline __m128i 794 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 795 _mm_movpi64_epi64(__m64 __A) { 796 return _mm_set_epi64((__m64)0LL, __A); 797 } 798 799 extern __inline __m128i 800 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 801 _mm_move_epi64(__m128i __A) { 802 return _mm_set_epi64((__m64)0LL, (__m64)__A[0]); 803 } 804 805 /* Create an undefined vector. */ 806 extern __inline __m128i 807 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 808 _mm_undefined_si128(void) { 809 __m128i __Y = __Y; 810 return __Y; 811 } 812 813 /* Create a vector of zeros. 
*/ 814 extern __inline __m128i 815 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 816 _mm_setzero_si128(void) { 817 return __extension__(__m128i)(__v4si){0, 0, 0, 0}; 818 } 819 820 #ifdef _ARCH_PWR8 821 extern __inline __m128d 822 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 823 _mm_cvtepi32_pd(__m128i __A) { 824 __v2di __val; 825 /* For LE need to generate Vector Unpack Low Signed Word. 826 Which is generated from unpackh. */ 827 __val = (__v2di)vec_unpackh((__v4si)__A); 828 829 return (__m128d)vec_ctf(__val, 0); 830 } 831 #endif 832 833 extern __inline __m128 834 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 835 _mm_cvtepi32_ps(__m128i __A) { 836 return ((__m128)vec_ctf((__v4si)__A, 0)); 837 } 838 839 extern __inline __m128i 840 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 841 _mm_cvtpd_epi32(__m128d __A) { 842 __v2df __rounded = vec_rint(__A); 843 __v4si __result, __temp; 844 const __v4si __vzero = {0, 0, 0, 0}; 845 846 /* VSX Vector truncate Double-Precision to integer and Convert to 847 Signed Integer Word format with Saturate. */ 848 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :); 849 850 #ifdef _ARCH_PWR8 851 #ifdef __LITTLE_ENDIAN__ 852 __temp = vec_mergeo(__temp, __temp); 853 #else 854 __temp = vec_mergee(__temp, __temp); 855 #endif 856 __result = (__v4si)vec_vpkudum((__vector long long)__temp, 857 (__vector long long)__vzero); 858 #else 859 { 860 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 861 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 862 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 863 } 864 #endif 865 return (__m128i)__result; 866 } 867 868 extern __inline __m64 869 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 870 _mm_cvtpd_pi32(__m128d __A) { 871 __m128i __result = _mm_cvtpd_epi32(__A); 872 873 return (__m64)__result[0]; 874 } 875 876 extern __inline __m128 877 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 878 _mm_cvtpd_ps(__m128d __A) { 879 __v4sf __result; 880 __v4si __temp; 881 const __v4si __vzero = {0, 0, 0, 0}; 882 883 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); 884 885 #ifdef _ARCH_PWR8 886 #ifdef __LITTLE_ENDIAN__ 887 __temp = vec_mergeo(__temp, __temp); 888 #else 889 __temp = vec_mergee(__temp, __temp); 890 #endif 891 __result = (__v4sf)vec_vpkudum((__vector long long)__temp, 892 (__vector long long)__vzero); 893 #else 894 { 895 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 896 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 897 __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 898 } 899 #endif 900 return ((__m128)__result); 901 } 902 903 extern __inline __m128i 904 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 905 _mm_cvttpd_epi32(__m128d __A) { 906 __v4si __result; 907 __v4si __temp; 908 const __v4si __vzero = {0, 0, 0, 0}; 909 910 /* VSX Vector truncate Double-Precision to integer and Convert to 911 Signed Integer Word format with Saturate. 
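     The converted words land in word elements 0 and 2 (big-endian
     numbering) of the VSX result, with the other words undefined, so
     the merge and pack steps below compact the two values into
     elements [0] and [1] and zero elements [2] and [3].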
*/ 912 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); 913 914 #ifdef _ARCH_PWR8 915 #ifdef __LITTLE_ENDIAN__ 916 __temp = vec_mergeo(__temp, __temp); 917 #else 918 __temp = vec_mergee(__temp, __temp); 919 #endif 920 __result = (__v4si)vec_vpkudum((__vector long long)__temp, 921 (__vector long long)__vzero); 922 #else 923 { 924 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 925 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 926 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 927 } 928 #endif 929 930 return ((__m128i)__result); 931 } 932 933 extern __inline __m64 934 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 935 _mm_cvttpd_pi32(__m128d __A) { 936 __m128i __result = _mm_cvttpd_epi32(__A); 937 938 return (__m64)__result[0]; 939 } 940 941 extern __inline int 942 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 943 _mm_cvtsi128_si32(__m128i __A) { 944 return ((__v4si)__A)[0]; 945 } 946 947 #ifdef _ARCH_PWR8 948 extern __inline __m128d 949 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 950 _mm_cvtpi32_pd(__m64 __A) { 951 __v4si __temp; 952 __v2di __tmp2; 953 __v2df __result; 954 955 __temp = (__v4si)vec_splats(__A); 956 __tmp2 = (__v2di)vec_unpackl(__temp); 957 __result = vec_ctf((__vector signed long long)__tmp2, 0); 958 return (__m128d)__result; 959 } 960 #endif 961 962 extern __inline __m128i 963 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 964 _mm_cvtps_epi32(__m128 __A) { 965 __v4sf __rounded; 966 __v4si __result; 967 968 __rounded = vec_rint((__v4sf)__A); 969 __result = vec_cts(__rounded, 0); 970 return (__m128i)__result; 971 } 972 973 extern __inline __m128i 974 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 975 _mm_cvttps_epi32(__m128 __A) { 976 __v4si __result; 977 978 __result = vec_cts((__v4sf)__A, 0); 979 return (__m128i)__result; 980 } 981 982 extern __inline __m128d 983 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 984 _mm_cvtps_pd(__m128 __A) { 985 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ 986 #ifdef vec_doubleh 987 return (__m128d)vec_doubleh((__v4sf)__A); 988 #else 989 /* Otherwise the compiler is not current and so need to generate the 990 equivalent code. */ 991 __v4sf __a = (__v4sf)__A; 992 __v4sf __temp; 993 __v2df __result; 994 #ifdef __LITTLE_ENDIAN__ 995 /* The input float values are in elements {[0], [1]} but the convert 996 instruction needs them in elements {[1], [3]}, So we use two 997 shift left double vector word immediates to get the elements 998 lined up. */ 999 __temp = __builtin_vsx_xxsldwi(__a, __a, 3); 1000 __temp = __builtin_vsx_xxsldwi(__a, __temp, 2); 1001 #else 1002 /* The input float values are in elements {[0], [1]} but the convert 1003 instruction needs them in elements {[0], [2]}, So we use two 1004 shift left double vector word immediates to get the elements 1005 lined up. */ 1006 __temp = vec_vmrghw(__a, __a); 1007 #endif 1008 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :); 1009 return (__m128d)__result; 1010 #endif 1011 } 1012 1013 extern __inline int 1014 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1015 _mm_cvtsd_si32(__m128d __A) { 1016 __v2df __rounded = vec_rint((__v2df)__A); 1017 int __result = ((__v2df)__rounded)[0]; 1018 1019 return __result; 1020 } 1021 /* Intel intrinsic. 
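   _mm_cvtsd_si64 rounds via vec_rint (the current rounding mode,
   round-to-nearest-even by default), while _mm_cvttsd_si64 below
   truncates toward zero.  For example (hypothetical values):

     _mm_cvtsd_si64(_mm_set_sd(2.7));   // 3
     _mm_cvttsd_si64(_mm_set_sd(2.7));  // 2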
*/ 1022 extern __inline long long 1023 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1024 _mm_cvtsd_si64(__m128d __A) { 1025 __v2df __rounded = vec_rint((__v2df)__A); 1026 long long __result = ((__v2df)__rounded)[0]; 1027 1028 return __result; 1029 } 1030 1031 /* Microsoft intrinsic. */ 1032 extern __inline long long 1033 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1034 _mm_cvtsd_si64x(__m128d __A) { 1035 return _mm_cvtsd_si64((__v2df)__A); 1036 } 1037 1038 extern __inline int 1039 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1040 _mm_cvttsd_si32(__m128d __A) { 1041 int __result = ((__v2df)__A)[0]; 1042 1043 return __result; 1044 } 1045 1046 /* Intel intrinsic. */ 1047 extern __inline long long 1048 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1049 _mm_cvttsd_si64(__m128d __A) { 1050 long long __result = ((__v2df)__A)[0]; 1051 1052 return __result; 1053 } 1054 1055 /* Microsoft intrinsic. */ 1056 extern __inline long long 1057 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1058 _mm_cvttsd_si64x(__m128d __A) { 1059 return _mm_cvttsd_si64(__A); 1060 } 1061 1062 extern __inline __m128 1063 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1064 _mm_cvtsd_ss(__m128 __A, __m128d __B) { 1065 __v4sf __result = (__v4sf)__A; 1066 1067 #ifdef __LITTLE_ENDIAN__ 1068 __v4sf __temp_s; 1069 /* Copy double element[0] to element [1] for conversion. */ 1070 __v2df __temp_b = vec_splat((__v2df)__B, 0); 1071 1072 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1073 __result = __builtin_vsx_xxsldwi(__result, __result, 3); 1074 /* Convert double to single float scalar in a vector. */ 1075 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :); 1076 /* Shift the resulting scalar into vector element [0]. */ 1077 __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1); 1078 #else 1079 __result[0] = ((__v2df)__B)[0]; 1080 #endif 1081 return (__m128)__result; 1082 } 1083 1084 extern __inline __m128d 1085 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1086 _mm_cvtsi32_sd(__m128d __A, int __B) { 1087 __v2df __result = (__v2df)__A; 1088 double __db = __B; 1089 __result[0] = __db; 1090 return (__m128d)__result; 1091 } 1092 1093 /* Intel intrinsic. */ 1094 extern __inline __m128d 1095 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1096 _mm_cvtsi64_sd(__m128d __A, long long __B) { 1097 __v2df __result = (__v2df)__A; 1098 double __db = __B; 1099 __result[0] = __db; 1100 return (__m128d)__result; 1101 } 1102 1103 /* Microsoft intrinsic. */ 1104 extern __inline __m128d 1105 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1106 _mm_cvtsi64x_sd(__m128d __A, long long __B) { 1107 return _mm_cvtsi64_sd(__A, __B); 1108 } 1109 1110 extern __inline __m128d 1111 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1112 _mm_cvtss_sd(__m128d __A, __m128 __B) { 1113 #ifdef __LITTLE_ENDIAN__ 1114 /* Use splat to move element [0] into position for the convert. */ 1115 __v4sf __temp = vec_splat((__v4sf)__B, 0); 1116 __v2df __res; 1117 /* Convert single float scalar to double in a vector. 
*/ 1118 __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :); 1119 return (__m128d)vec_mergel(__res, (__v2df)__A); 1120 #else 1121 __v2df __res = (__v2df)__A; 1122 __res[0] = ((__v4sf)__B)[0]; 1123 return (__m128d)__res; 1124 #endif 1125 } 1126 1127 extern __inline __m128d 1128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1129 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { 1130 __vector double __result; 1131 const int __litmsk = __mask & 0x3; 1132 1133 if (__litmsk == 0) 1134 __result = vec_mergeh(__A, __B); 1135 #if __GNUC__ < 6 1136 else if (__litmsk == 1) 1137 __result = vec_xxpermdi(__B, __A, 2); 1138 else if (__litmsk == 2) 1139 __result = vec_xxpermdi(__B, __A, 1); 1140 #else 1141 else if (__litmsk == 1) 1142 __result = vec_xxpermdi(__A, __B, 2); 1143 else if (__litmsk == 2) 1144 __result = vec_xxpermdi(__A, __B, 1); 1145 #endif 1146 else 1147 __result = vec_mergel(__A, __B); 1148 1149 return __result; 1150 } 1151 1152 extern __inline __m128d 1153 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1154 _mm_unpackhi_pd(__m128d __A, __m128d __B) { 1155 return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B); 1156 } 1157 1158 extern __inline __m128d 1159 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1160 _mm_unpacklo_pd(__m128d __A, __m128d __B) { 1161 return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B); 1162 } 1163 1164 extern __inline __m128d 1165 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1166 _mm_loadh_pd(__m128d __A, double const *__B) { 1167 __v2df __result = (__v2df)__A; 1168 __result[1] = *__B; 1169 return (__m128d)__result; 1170 } 1171 1172 extern __inline __m128d 1173 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1174 _mm_loadl_pd(__m128d __A, double const *__B) { 1175 __v2df __result = (__v2df)__A; 1176 __result[0] = *__B; 1177 return (__m128d)__result; 1178 } 1179 1180 #ifdef _ARCH_PWR8 1181 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1182 1183 /* Creates a 2-bit mask from the most significant bits of the DPFP values. 
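   For example (a usage sketch, not part of the original header):

     __m128d __v = _mm_set_pd(2.0, -1.0);  // {-1.0, 2.0}
     int __m = _mm_movemask_pd(__v);       // __m == 0x1: only element [0] is negative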
*/ 1184 extern __inline int 1185 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1186 _mm_movemask_pd(__m128d __A) { 1187 #ifdef _ARCH_PWR10 1188 return vec_extractm((__v2du)__A); 1189 #else 1190 __vector unsigned long long __result; 1191 static const __vector unsigned int __perm_mask = { 1192 #ifdef __LITTLE_ENDIAN__ 1193 0x80800040, 0x80808080, 0x80808080, 0x80808080 1194 #else 1195 0x80808080, 0x80808080, 0x80808080, 0x80804000 1196 #endif 1197 }; 1198 1199 __result = ((__vector unsigned long long)vec_vbpermq( 1200 (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); 1201 1202 #ifdef __LITTLE_ENDIAN__ 1203 return __result[1]; 1204 #else 1205 return __result[0]; 1206 #endif 1207 #endif /* !_ARCH_PWR10 */ 1208 } 1209 #endif /* _ARCH_PWR8 */ 1210 1211 extern __inline __m128i 1212 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1213 _mm_packs_epi16(__m128i __A, __m128i __B) { 1214 return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B); 1215 } 1216 1217 extern __inline __m128i 1218 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1219 _mm_packs_epi32(__m128i __A, __m128i __B) { 1220 return (__m128i)vec_packs((__v4si)__A, (__v4si)__B); 1221 } 1222 1223 extern __inline __m128i 1224 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1225 _mm_packus_epi16(__m128i __A, __m128i __B) { 1226 return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B); 1227 } 1228 1229 extern __inline __m128i 1230 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1231 _mm_unpackhi_epi8(__m128i __A, __m128i __B) { 1232 return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B); 1233 } 1234 1235 extern __inline __m128i 1236 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1237 _mm_unpackhi_epi16(__m128i __A, __m128i __B) { 1238 return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B); 1239 } 1240 1241 extern __inline __m128i 1242 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1243 _mm_unpackhi_epi32(__m128i __A, __m128i __B) { 1244 return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B); 1245 } 1246 1247 extern __inline __m128i 1248 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1249 _mm_unpackhi_epi64(__m128i __A, __m128i __B) { 1250 return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B); 1251 } 1252 1253 extern __inline __m128i 1254 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1255 _mm_unpacklo_epi8(__m128i __A, __m128i __B) { 1256 return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B); 1257 } 1258 1259 extern __inline __m128i 1260 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1261 _mm_unpacklo_epi16(__m128i __A, __m128i __B) { 1262 return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B); 1263 } 1264 1265 extern __inline __m128i 1266 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1267 _mm_unpacklo_epi32(__m128i __A, __m128i __B) { 1268 return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B); 1269 } 1270 1271 extern __inline __m128i 1272 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1273 _mm_unpacklo_epi64(__m128i __A, __m128i __B) { 1274 return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B); 1275 } 1276 1277 extern __inline __m128i 1278 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1279 _mm_add_epi8(__m128i __A, __m128i __B) { 1280 return (__m128i)((__v16qu)__A + (__v16qu)__B); 1281 } 1282 1283 extern __inline __m128i 1284 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1285 _mm_add_epi16(__m128i __A, __m128i __B) { 1286 return (__m128i)((__v8hu)__A + (__v8hu)__B); 1287 } 1288 1289 extern __inline __m128i 1290 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1291 _mm_add_epi32(__m128i __A, __m128i __B) { 1292 return (__m128i)((__v4su)__A + (__v4su)__B); 1293 } 1294 1295 extern __inline __m128i 1296 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1297 _mm_add_epi64(__m128i __A, __m128i __B) { 1298 return (__m128i)((__v2du)__A + (__v2du)__B); 1299 } 1300 1301 extern __inline __m128i 1302 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1303 _mm_adds_epi8(__m128i __A, __m128i __B) { 1304 return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B); 1305 } 1306 1307 extern __inline __m128i 1308 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1309 _mm_adds_epi16(__m128i __A, __m128i __B) { 1310 return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B); 1311 } 1312 1313 extern __inline __m128i 1314 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1315 _mm_adds_epu8(__m128i __A, __m128i __B) { 1316 return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B); 1317 } 1318 1319 extern __inline __m128i 1320 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1321 _mm_adds_epu16(__m128i __A, __m128i __B) { 1322 return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B); 1323 } 1324 1325 extern __inline __m128i 1326 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1327 _mm_sub_epi8(__m128i __A, __m128i __B) { 1328 return (__m128i)((__v16qu)__A - (__v16qu)__B); 1329 } 1330 1331 extern __inline __m128i 1332 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1333 _mm_sub_epi16(__m128i __A, __m128i __B) { 1334 return (__m128i)((__v8hu)__A - (__v8hu)__B); 1335 } 1336 1337 extern __inline __m128i 1338 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1339 _mm_sub_epi32(__m128i __A, __m128i __B) { 1340 return (__m128i)((__v4su)__A - (__v4su)__B); 1341 } 1342 1343 extern __inline __m128i 1344 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1345 _mm_sub_epi64(__m128i __A, __m128i __B) { 1346 return (__m128i)((__v2du)__A - (__v2du)__B); 1347 } 1348 1349 extern __inline __m128i 1350 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1351 _mm_subs_epi8(__m128i __A, __m128i __B) { 1352 return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B); 1353 } 1354 1355 extern __inline __m128i 1356 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1357 _mm_subs_epi16(__m128i __A, __m128i __B) { 1358 return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B); 1359 } 1360 1361 extern __inline __m128i 1362 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1363 _mm_subs_epu8(__m128i __A, __m128i __B) { 1364 return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B); 1365 } 1366 1367 extern __inline __m128i 1368 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1369 _mm_subs_epu16(__m128i __A, __m128i __B) { 1370 return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B); 1371 } 1372 1373 extern __inline __m128i 1374 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1375 _mm_madd_epi16(__m128i __A, __m128i __B) { 1376 __vector signed int __zero = {0, 0, 0, 0}; 1377 1378 return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero); 1379 } 1380 1381 extern __inline __m128i 1382 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) 1383 _mm_mulhi_epi16(__m128i __A, __m128i __B) { 1384 __vector signed int __w0, __w1; 1385 1386 __vector unsigned char __xform1 = { 1387 #ifdef __LITTLE_ENDIAN__ 1388 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 1389 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1390 #else 1391 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, 1392 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1393 #endif 1394 }; 1395 1396 __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B); 1397 __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B); 1398 return (__m128i)vec_perm(__w0, __w1, __xform1); 1399 } 1400 1401 extern __inline __m128i 1402 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1403 _mm_mullo_epi16(__m128i __A, __m128i __B) { 1404 return (__m128i)((__v8hi)__A * (__v8hi)__B); 1405 } 1406 1407 extern __inline __m64 1408 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1409 _mm_mul_su32(__m64 __A, __m64 __B) { 1410 unsigned int __a = __A; 1411 unsigned int __b = __B; 1412 1413 return ((__m64)__a * (__m64)__b); 1414 } 1415 1416 #ifdef _ARCH_PWR8 1417 extern __inline __m128i 1418 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1419 _mm_mul_epu32(__m128i __A, __m128i __B) { 1420 #if __GNUC__ < 8 1421 __v2du __result; 1422 1423 #ifdef __LITTLE_ENDIAN__ 1424 /* VMX Vector Multiply Odd Unsigned Word. */ 1425 __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); 1426 #else 1427 /* VMX Vector Multiply Even Unsigned Word. */ 1428 __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); 1429 #endif 1430 return (__m128i)__result; 1431 #else 1432 return (__m128i)vec_mule((__v4su)__A, (__v4su)__B); 1433 #endif 1434 } 1435 #endif 1436 1437 extern __inline __m128i 1438 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1439 _mm_slli_epi16(__m128i __A, int __B) { 1440 __v8hu __lshift; 1441 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; 1442 1443 if (__B >= 0 && __B < 16) { 1444 if (__builtin_constant_p(__B)) 1445 __lshift = (__v8hu)vec_splat_s16(__B); 1446 else 1447 __lshift = vec_splats((unsigned short)__B); 1448 1449 __result = vec_sl((__v8hi)__A, __lshift); 1450 } 1451 1452 return (__m128i)__result; 1453 } 1454 1455 extern __inline __m128i 1456 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1457 _mm_slli_epi32(__m128i __A, int __B) { 1458 __v4su __lshift; 1459 __v4si __result = {0, 0, 0, 0}; 1460 1461 if (__B >= 0 && __B < 32) { 1462 if (__builtin_constant_p(__B) && __B < 16) 1463 __lshift = (__v4su)vec_splat_s32(__B); 1464 else 1465 __lshift = vec_splats((unsigned int)__B); 1466 1467 __result = vec_sl((__v4si)__A, __lshift); 1468 } 1469 1470 return (__m128i)__result; 1471 } 1472 1473 #ifdef _ARCH_PWR8 1474 extern __inline __m128i 1475 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1476 _mm_slli_epi64(__m128i __A, int __B) { 1477 __v2du __lshift; 1478 __v2di __result = {0, 0}; 1479 1480 if (__B >= 0 && __B < 64) { 1481 if (__builtin_constant_p(__B) && __B < 16) 1482 __lshift = (__v2du)vec_splat_s32(__B); 1483 else 1484 __lshift = (__v2du)vec_splats((unsigned int)__B); 1485 1486 __result = vec_sl((__v2di)__A, __lshift); 1487 } 1488 1489 return (__m128i)__result; 1490 } 1491 #endif 1492 1493 extern __inline __m128i 1494 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1495 _mm_srai_epi16(__m128i __A, int __B) { 1496 __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15}; 1497 __v8hi __result; 1498 1499 if (__B < 16) { 1500 if (__builtin_constant_p(__B)) 1501 __rshift = 
(__v8hu)vec_splat_s16(__B); 1502 else 1503 __rshift = vec_splats((unsigned short)__B); 1504 } 1505 __result = vec_sra((__v8hi)__A, __rshift); 1506 1507 return (__m128i)__result; 1508 } 1509 1510 extern __inline __m128i 1511 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1512 _mm_srai_epi32(__m128i __A, int __B) { 1513 __v4su __rshift = {31, 31, 31, 31}; 1514 __v4si __result; 1515 1516 if (__B < 32) { 1517 if (__builtin_constant_p(__B)) { 1518 if (__B < 16) 1519 __rshift = (__v4su)vec_splat_s32(__B); 1520 else 1521 __rshift = (__v4su)vec_splats((unsigned int)__B); 1522 } else 1523 __rshift = vec_splats((unsigned int)__B); 1524 } 1525 __result = vec_sra((__v4si)__A, __rshift); 1526 1527 return (__m128i)__result; 1528 } 1529 1530 extern __inline __m128i 1531 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1532 _mm_bslli_si128(__m128i __A, const int __N) { 1533 __v16qu __result; 1534 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1535 1536 if (__N < 16) 1537 __result = vec_sld((__v16qu)__A, __zeros, __N); 1538 else 1539 __result = __zeros; 1540 1541 return (__m128i)__result; 1542 } 1543 1544 extern __inline __m128i 1545 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1546 _mm_bsrli_si128(__m128i __A, const int __N) { 1547 __v16qu __result; 1548 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1549 1550 if (__N < 16) 1551 #ifdef __LITTLE_ENDIAN__ 1552 if (__builtin_constant_p(__N)) 1553 /* Would like to use Vector Shift Left Double by Octet 1554 Immediate here to use the immediate form and avoid 1555 load of __N * 8 value into a separate VR. */ 1556 __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N)); 1557 else 1558 #endif 1559 { 1560 __v16qu __shift = vec_splats((unsigned char)(__N * 8)); 1561 #ifdef __LITTLE_ENDIAN__ 1562 __result = vec_sro((__v16qu)__A, __shift); 1563 #else 1564 __result = vec_slo((__v16qu)__A, __shift); 1565 #endif 1566 } 1567 else 1568 __result = __zeros; 1569 1570 return (__m128i)__result; 1571 } 1572 1573 extern __inline __m128i 1574 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1575 _mm_srli_si128(__m128i __A, const int __N) { 1576 return _mm_bsrli_si128(__A, __N); 1577 } 1578 1579 extern __inline __m128i 1580 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1581 _mm_slli_si128(__m128i __A, const int _imm5) { 1582 __v16qu __result; 1583 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1584 1585 if (_imm5 < 16) 1586 #ifdef __LITTLE_ENDIAN__ 1587 __result = vec_sld((__v16qu)__A, __zeros, _imm5); 1588 #else 1589 __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5)); 1590 #endif 1591 else 1592 __result = __zeros; 1593 1594 return (__m128i)__result; 1595 } 1596 1597 extern __inline __m128i 1598 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1599 1600 _mm_srli_epi16(__m128i __A, int __B) { 1601 __v8hu __rshift; 1602 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; 1603 1604 if (__B < 16) { 1605 if (__builtin_constant_p(__B)) 1606 __rshift = (__v8hu)vec_splat_s16(__B); 1607 else 1608 __rshift = vec_splats((unsigned short)__B); 1609 1610 __result = vec_sr((__v8hi)__A, __rshift); 1611 } 1612 1613 return (__m128i)__result; 1614 } 1615 1616 extern __inline __m128i 1617 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1618 _mm_srli_epi32(__m128i __A, int __B) { 1619 __v4su __rshift; 1620 __v4si __result = {0, 0, 0, 0}; 1621 1622 if (__B < 32) { 1623 if 
(__builtin_constant_p(__B)) { 1624 if (__B < 16) 1625 __rshift = (__v4su)vec_splat_s32(__B); 1626 else 1627 __rshift = (__v4su)vec_splats((unsigned int)__B); 1628 } else 1629 __rshift = vec_splats((unsigned int)__B); 1630 1631 __result = vec_sr((__v4si)__A, __rshift); 1632 } 1633 1634 return (__m128i)__result; 1635 } 1636 1637 #ifdef _ARCH_PWR8 1638 extern __inline __m128i 1639 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1640 _mm_srli_epi64(__m128i __A, int __B) { 1641 __v2du __rshift; 1642 __v2di __result = {0, 0}; 1643 1644 if (__B < 64) { 1645 if (__builtin_constant_p(__B)) { 1646 if (__B < 16) 1647 __rshift = (__v2du)vec_splat_s32(__B); 1648 else 1649 __rshift = (__v2du)vec_splats((unsigned long long)__B); 1650 } else 1651 __rshift = (__v2du)vec_splats((unsigned int)__B); 1652 1653 __result = vec_sr((__v2di)__A, __rshift); 1654 } 1655 1656 return (__m128i)__result; 1657 } 1658 #endif 1659 1660 extern __inline __m128i 1661 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1662 _mm_sll_epi16(__m128i __A, __m128i __B) { 1663 __v8hu __lshift; 1664 __vector __bool short __shmask; 1665 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15}; 1666 __v8hu __result; 1667 1668 #ifdef __LITTLE_ENDIAN__ 1669 __lshift = vec_splat((__v8hu)__B, 0); 1670 #else 1671 __lshift = vec_splat((__v8hu)__B, 3); 1672 #endif 1673 __shmask = vec_cmple(__lshift, __shmax); 1674 __result = vec_sl((__v8hu)__A, __lshift); 1675 __result = vec_sel((__v8hu)__shmask, __result, __shmask); 1676 1677 return (__m128i)__result; 1678 } 1679 1680 extern __inline __m128i 1681 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1682 _mm_sll_epi32(__m128i __A, __m128i __B) { 1683 __v4su __lshift; 1684 __vector __bool int __shmask; 1685 const __v4su __shmax = {32, 32, 32, 32}; 1686 __v4su __result; 1687 #ifdef __LITTLE_ENDIAN__ 1688 __lshift = vec_splat((__v4su)__B, 0); 1689 #else 1690 __lshift = vec_splat((__v4su)__B, 1); 1691 #endif 1692 __shmask = vec_cmplt(__lshift, __shmax); 1693 __result = vec_sl((__v4su)__A, __lshift); 1694 __result = vec_sel((__v4su)__shmask, __result, __shmask); 1695 1696 return (__m128i)__result; 1697 } 1698 1699 #ifdef _ARCH_PWR8 1700 extern __inline __m128i 1701 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1702 _mm_sll_epi64(__m128i __A, __m128i __B) { 1703 __v2du __lshift; 1704 __vector __bool long long __shmask; 1705 const __v2du __shmax = {64, 64}; 1706 __v2du __result; 1707 1708 __lshift = vec_splat((__v2du)__B, 0); 1709 __shmask = vec_cmplt(__lshift, __shmax); 1710 __result = vec_sl((__v2du)__A, __lshift); 1711 __result = vec_sel((__v2du)__shmask, __result, __shmask); 1712 1713 return (__m128i)__result; 1714 } 1715 #endif 1716 1717 extern __inline __m128i 1718 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1719 _mm_sra_epi16(__m128i __A, __m128i __B) { 1720 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15}; 1721 __v8hu __rshift; 1722 __v8hi __result; 1723 1724 #ifdef __LITTLE_ENDIAN__ 1725 __rshift = vec_splat((__v8hu)__B, 0); 1726 #else 1727 __rshift = vec_splat((__v8hu)__B, 3); 1728 #endif 1729 __rshift = vec_min(__rshift, __rshmax); 1730 __result = vec_sra((__v8hi)__A, __rshift); 1731 1732 return (__m128i)__result; 1733 } 1734 1735 extern __inline __m128i 1736 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1737 _mm_sra_epi32(__m128i __A, __m128i __B) { 1738 const __v4su __rshmax = {31, 31, 31, 31}; 1739 __v4su __rshift; 1740 __v4si __result; 1741 1742 #ifdef 
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_epi32(__m128i __A, __m128i __B) {
  const __v4su __rshmax = {31, 31, 31, 31};
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi16(__m128i __A, __m128i __B) {
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__rshift, __shmax);
  __result = vec_sr((__v8hu)__A, __rshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}
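
/* Example (hypothetical helper, not part of this header): note the operand
   order of the andnot forms: _mm_andnot_pd(__A, __B) computes (~__A) & __B,
   which is vec_andc with the arguments swapped.  A common use is clearing
   the sign bits:

     __m128d __fabs_pd(__m128d __x) {
       return _mm_andnot_pd(_mm_set1_pd(-0.0), __x);
     }
 */
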
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}
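
/* Usage note (illustrative, not part of the SSE2 API): _mm_extract_epi16
   zero-extends the selected element, so a negative lane comes back as its
   unsigned 16-bit value, matching x86 pextrw.  For example:

     __m128i __v = _mm_set1_epi16(-1);
     int __e = _mm_extract_epi16(__v, 3);

   leaves __e equal to 0xFFFF (65535), not -1.  */
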
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Return a mask created from the most significant bit of each 8-bit
   element in A. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
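
/* Example (hypothetical helper, not part of this header): the vec_vbpermq
   path above gathers the sixteen byte sign bits into the low halfword of
   the result, which supports the usual SSE2 idiom of scanning 16 bytes at
   a time:

     int __has_zero_byte(__m128i __v) {
       __m128i __m = _mm_cmpeq_epi8(__v, _mm_setzero_si128());
       return _mm_movemask_epi8(__m) != 0;
     }
 */
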
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}
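
/* Worked example (illustrative, not part of this header): the shuffle
   selector is consumed two bits per destination element, element 0 first.
   With __v = _mm_set_epi32(3, 2, 1, 0), so that element __i holds __i,

     __m128i __r = _mm_shuffle_epi32(__v, 0x1B);

   places source elements 3, 2, 1, 0 into result elements 0, 1, 2, 3; that
   is, 0x1B == _MM_SHUFFLE(0, 1, 2, 3) reverses the four 32-bit elements.  */
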
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers. */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results. */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed. */
#else
  /* Sum across four integers with two integer results. */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position. */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush. */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering. */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering. */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type. */
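
/* Example (hypothetical helper, not part of this header): because a cast
   only reinterprets the 128-bit pattern, integer operations can be applied
   to floating-point data in place, e.g. flipping the sign of both doubles
   with an integer XOR:

     __m128d __negate_pd(__m128d __x) {
       __m128i __sign = _mm_castpd_si128(_mm_set1_pd(-0.0));
       return _mm_castsi128_pd(
           _mm_xor_si128(_mm_castpd_si128(__x), __sign));
     }
 */
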
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) */

#endif /* EMMINTRIN_H_ */