/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
   However scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are differences in data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE2 scalar float semantics on POWER.

   It should be noted that there is much difference between X86_64's
   MXCSR and PowerISA's FPSCR/VSCR registers.  It is recommended to use
   the portable <fenv.h> instead of accessing MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
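/* Example (illustrative, not part of this header): code written against the
   x86 <emmintrin.h> can usually be rebuilt for POWER unchanged once the
   warning above is acknowledged on the command line; the target triple and
   source file below are placeholders:

     clang -target powerpc64le-unknown-linux-gnu -O2 \
         -DNO_WARN_X86_INTRINSICS -c foo.c
*/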
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}

/* Create an undefined vector.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_pd(void) {
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_pd(void) {
  return (__m128d)vec_splats(0);
}
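/* Illustrative sketch (not part of the API): _mm_set_pd lists elements from
   the high element down, while _mm_setr_pd lists them from element [0] up,
   so the two calls below build the same vector:

     __m128d __x = _mm_set_pd(2.0, 1.0);  // element [0] = 1.0, [1] = 2.0
     __m128d __y = _mm_setr_pd(1.0, 2.0); // same layout
*/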
/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_sd(__m128d __A, __m128d __B) {
  __v2df __result = (__v2df)__A;
  __result[0] = ((__v2df)__B)[0];
  return (__m128d)__result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd(double const *__P) {
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_pd(double const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_sd(double const *__P) {
  return _mm_set_sd(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd1(double const *__P) {
  return _mm_load1_pd(__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd(double *__P, __m128d __A) {
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}
/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_pd(double *__P, __m128d __A) {
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_sd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[0];
}

extern __inline double
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_f64(__m128d __A) {
  return ((__v2df)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pd(double *__P, __m128d __A) {
  _mm_store_sd(__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_splat(__A, 0));
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd1(double *__P, __m128d __A) {
  _mm_store1_pd(__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64(__m128i __A) {
  return ((__v2di)__A)[0];
}
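/* Illustrative sketch: splitting a vector into its two scalars with the
   stores above (all names below are hypothetical):

     double __lo, __hi;
     _mm_store_sd(&__lo, __v);  // element [0]
     _mm_storeh_pd(&__hi, __v); // element [1]
*/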
/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64x(__m128i __A) {
  return ((__v2di)__A)[0];
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_pd(__m128d __A) {
  return (vec_sqrt(__A));
}
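/* Illustrative sketch: the *_sd arithmetic above operates on element [0]
   only and passes element [1] of the first operand through unchanged:

     __m128d __a = _mm_set_pd(9.0, 1.0); // {1.0, 9.0}
     __m128d __b = _mm_set1_pd(2.0);     // {2.0, 2.0}
     __m128d __c = _mm_add_sd(__a, __b); // {3.0, 9.0}
*/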
/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
  __v2df __c;
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pd(__m128d __A, __m128d __B) {
  return (vec_min(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_min(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pd(__m128d __A, __m128d __B) {
  return (vec_max(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_max(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}
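/* Illustrative sketch: the vector compares above return per-element masks,
   all '1' bits where the predicate holds and all '0' bits otherwise, which
   is what makes them usable with the bitwise select operations this header
   family provides:

     __m128d __m = _mm_cmpgt_pd(__a, __b); // per-element all-ones/all-zeros
*/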
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) marks a NAN.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than.  */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}
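/* Illustrative sketch: because the implementations above compile to plain C
   comparisons, any comparison involving a NaN yields 0 (false):

     __m128d __n = _mm_set_sd(__builtin_nan(""));
     int __eq = _mm_comieq_sd(__n, __n); // 0; a NaN never compares equal
*/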
/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
                  short __q2, short __q1, short __q0) {
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                        __q4, __q5, __q6, __q7};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
                 char __q10, char __q09, char __q08, char __q07, char __q06,
                 char __q05, char __q04, char __q03, char __q02, char __q01,
                 char __q00) {
  return __extension__(__m128i)(__v16qi){
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}
/* Set all of the elements of the vector to A.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64x(long long __A) {
  return _mm_set_epi64x(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64(__m64 __A) {
  return _mm_set_epi64(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi32(int __A) {
  return _mm_set_epi32(__A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi16(short __A) {
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi8(char __A) {
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
                      __A, __A, __A, __A, __A);
}
/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
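/* Illustrative sketch: _mm_setr_epi32 lists elements from [0] upward, so
   these two calls produce the same vector:

     __m128i __p = _mm_set_epi32(3, 2, 1, 0);  // element [0] = 0 ... [3] = 3
     __m128i __q = _mm_setr_epi32(0, 1, 2, 3); // same layout
*/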
/* Load 128 bits of integer data from P.  The address must be 16-byte
   aligned.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  __m128i __Y = __Y;
  return __Y;
}
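/* Illustrative sketch: _mm_loadl_epi64 above zero-extends, loading 64 bits
   into element [0] and clearing element [1] (names are hypothetical):

     long long __q = 42;
     __m128i __v = _mm_loadl_epi64((__m128i_u const *)&__q); // {42, 0}
*/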
/* Create a vector of zeros.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_pd(__m128i __A) {
  __v2di __val;
  /* For LE we need the Vector Unpack Low Signed Word, which
     vec_unpackh generates here.  */
  __val = (__v2di)vec_unpackh((__v4si)__A);

  return (__m128d)vec_ctf(__val, 0);
}
#endif

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_ps(__m128i __A) {
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvttpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si32(__m128i __A) {
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats(__A);
  __tmp2 = (__v2di)vec_unpackl(__temp);
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
  return (__m128d)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_epi32(__m128 __A) {
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf)__A);
  __result = vec_cts(__rounded, 0);
  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_epi32(__m128 __A) {
  __v4si __result;

  __result = vec_cts((__v4sf)__A, 0);
  return (__m128i)__result;
}
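/* Illustrative sketch: _mm_cvtpd_epi32 rounds first (vec_rint honors the
   current rounding mode) while _mm_cvttpd_epi32 truncates toward zero; the
   upper two lanes of the result are zeroed in both cases:

     __m128d __d = _mm_set_pd(-2.5, 1.7); // {1.7, -2.5}
     __m128i __r = _mm_cvtpd_epi32(__d);  // {2, -2, 0, 0} (round-to-nearest-even)
     __m128i __t = _mm_cvttpd_epi32(__d); // {1, -2, 0, 0}
*/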
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pd(__m128 __A) {
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d)vec_doubleh((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so we need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = vec_vmrghw(__a, __a);
#endif
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  return (__m128d)__result;
#endif
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si32(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  long long __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64x(__m128d __A) {
  return _mm_cvtsd_si64((__v2df)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si32(__m128d __A) {
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64(__m128d __A) {
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64x(__m128d __A) {
  return _mm_cvttsd_si64(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element [0] to element [1] for conversion. */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements. */
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  /* Convert double to single float scalar in a vector. */
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  /* Shift the resulting scalar into vector element [0]. */
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
  __result[0] = ((__v2df)__B)[0];
#endif
  return (__m128)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_sd(__m128d __A, int __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
  return _mm_cvtsi64_sd(__A, __B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert. */
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector. */
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res[0] = ((__v4sf)__B)[0];
  return (__m128d)__res;
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh(__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__A, __B, 1);
#endif
  else
    __result = vec_mergel(__A, __B);

  return __result;
}
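
/* Worked example for _mm_shuffle_pd above (an illustrative sketch, not
   part of the original header): bit 0 of the mask selects result element
   [0] from __A, bit 1 selects result element [1] from __B, and
   _MM_SHUFFLE2(x, y) encodes (x << 1) | y:

     __m128d __a = _mm_set_pd(11.0, 10.0);  // __a[0] = 10.0, __a[1] = 11.0
     __m128d __b = _mm_set_pd(21.0, 20.0);  // __b[0] = 20.0, __b[1] = 21.0
     _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 1));
     // -> {__a[1], __b[0]} = {11.0, 20.0} */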

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Creates a 2-bit mask from the most significant bits of the DPFP values. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v2du)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
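
/* Usage sketch for _mm_movemask_pd above (illustrative, not part of the
   original header): bit i of the result is the sign bit of element [i],
   so testing for a negative lane becomes a scalar compare:

     __m128d __v = _mm_set_pd(-1.0, 2.0);  // __v[0] = 2.0, __v[1] = -1.0
     int __m = _mm_movemask_pd(__v);       // 0b10 -> 2 (element [1] negative)
     if (__m != 0) { /-* some element is negative *-/ }
   (the /-* *-/ markers above stand in for nested comment delimiters) */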

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
}
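
/* Worked example (illustrative, not part of the original header): the
   pack operations above narrow with saturation, e.g. _mm_packs_epi32
   clamps each 32-bit input to the signed 16-bit range [-32768, 32767]:

     __m128i __a = _mm_set_epi32(70000, -70000, 1, -1);
     _mm_packs_epi32(__a, __a);
     // 16-bit elements [0..7] = {-1, 1, -32768, 32767, -1, 1, -32768, 32767}

   _mm_packus_epi16 clamps to the unsigned range [0, 255] instead, so -5
   becomes 0 and 300 becomes 255. */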

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A - (__v2du)__B);
}
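
/* Worked example (illustrative, not part of the original header): the
   plain adds wrap modulo the element size while the _mm_adds_* forms
   saturate:

     __m128i __x = _mm_set1_epi8(127);
     __m128i __y = _mm_set1_epi8(1);
     _mm_add_epi8(__x, __y);                  // every byte wraps to -128
     _mm_adds_epi8(__x, __y);                 // every byte saturates at 127
     _mm_adds_epu8(_mm_set1_epi8(-1), __y);   // 255 + 1 saturates at 255 */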

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_epi16(__m128i __A, __m128i __B) {
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
}
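
/* For reference (an aside, not part of the original header):
   _mm_madd_epi16 computes, for each 32-bit lane i,

     result[i] = __A[2i] * __B[2i] + __A[2i+1] * __B[2i+1]

   on signed 16-bit elements, with the products and their sum kept in 32
   bits; vec_vmsumshm (multiply-sum halfword modulo) provides exactly
   this, so only a zero accumulator is needed. */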

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
  __vector signed int __w0, __w1;

  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_su32(__m64 __A, __m64 __B) {
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ < 8
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word. */
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#else
  /* VMX Vector Multiply Even Unsigned Word. */
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#endif
  return (__m128i)__result;
#else
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#endif
}
#endif
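
/* Worked example (illustrative, not part of the original header):
   _mm_mullo_epi16 and _mm_mulhi_epi16 above return the low and high
   halves of the full 32-bit product of each signed 16-bit pair:

     20000 * 30000 = 600000000 = 0x23C34600
     _mm_mullo_epi16 lane -> 0x4600,  _mm_mulhi_epi16 lane -> 0x23C3

   _mm_mul_epu32 instead multiplies the unsigned 32-bit elements [0] and
   [2] of each operand, producing two full 64-bit products. */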

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi16(__m128i __A, int __B) {
  __v8hu __lshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B >= 0 && __B < 16) {
    if (__builtin_constant_p(__B))
      __lshift = (__v8hu)vec_splat_s16(__B);
    else
      __lshift = vec_splats((unsigned short)__B);

    __result = vec_sl((__v8hi)__A, __lshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi32(__m128i __A, int __B) {
  __v4su __lshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B >= 0 && __B < 32) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v4su)vec_splat_s32(__B);
    else
      __lshift = vec_splats((unsigned int)__B);

    __result = vec_sl((__v4si)__A, __lshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi64(__m128i __A, int __B) {
  __v2du __lshift;
  __v2di __result = {0, 0};

  if (__B >= 0 && __B < 64) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v2du)vec_splat_s32(__B);
    else
      __lshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sl((__v2di)__A, __lshift);
  }

  return (__m128i)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi16(__m128i __A, int __B) {
  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hi __result;

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);
  }
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi32(__m128i __A, int __B) {
  __v4su __rshift = {31, 31, 31, 31};
  __v4si __result;

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);
  }
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bslli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
    __result = vec_sld((__v16qu)__A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bsrli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR. */
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
    else
#endif
    {
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
#ifdef __LITTLE_ENDIAN__
      __result = vec_sro((__v16qu)__A, __shift);
#else
      __result = vec_slo((__v16qu)__A, __shift);
#endif
    }
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si128(__m128i __A, const int __N) {
  return _mm_bsrli_si128(__A, __N);
}
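
/* Note (an illustrative aside, not part of the original header): the
   si128 shifts here move the whole 128-bit value by __N bytes, zero
   filling, i.e. PSLLDQ/PSRLDQ semantics rather than per-element bit
   shifts.  For example:

     __m128i __v = _mm_set_epi32(4, 3, 2, 1);
     _mm_bsrli_si128(__v, 4);  // 32-bit elements [0..3] become {2, 3, 4, 0} */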

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si128(__m128i __A, const int _imm5) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
#else
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
  __v8hu __rshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);

    __result = vec_sr((__v8hi)__A, __rshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi32(__m128i __A, int __B) {
  __v4su __rshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);

    __result = vec_sr((__v4si)__A, __rshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B < 64) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif
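
/* Worked example (illustrative, not part of the original header):
   arithmetic right shifts replicate the sign bit while logical right
   shifts insert zeros:

     __m128i __v = _mm_set1_epi16(-32768);  // 0x8000 in every element
     _mm_srai_epi16(__v, 4);  // 0xF800 = -2048 per element
     _mm_srli_epi16(__v, 4);  // 0x0800 =  2048 per element */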

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi32(__m128i __A, __m128i __B) {
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v4su)__B, 0);
#else
  __lshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v4su)__A, __lshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi64(__m128i __A, __m128i __B) {
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __lshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v2du)__A, __lshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif
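
/* Note (an illustrative aside, not part of the original header): in the
   _mm_sll_* / _mm_srl_* / _mm_sra_* forms the shift count is taken from
   the low doubleword of __B and applies to every element.  Counts at or
   past the element width yield zero (or all sign bits for the arithmetic
   shifts), which is what the __shmask select and vec_min clamp logic in
   these functions implements.  For example:

     _mm_sll_epi16(__x, _mm_cvtsi32_si128(20));  // 20 > 15 -> all zeros */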

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_epi16(__m128i __A, __m128i __B) {
  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_epi32(__m128i __A, __m128i __B) {
  const __v4su __rshmax = {31, 31, 31, 31};
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi16(__m128i __A, __m128i __B) {
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__rshift, __shmax);
  __result = vec_sr((__v8hu)__A, __rshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}
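
/* Note (an illustrative aside, not part of the original header): the
   andnot forms complement their FIRST operand, result = (~__A) & __B,
   which is why the vec_andc arguments above are swapped: vec_andc(x, y)
   computes x & ~y.  A typical use keeps only unmasked lanes:

     _mm_andnot_si128(__mask, __bits);  // __bits where __mask bits are 0 */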

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}
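
/* Usage sketch (illustrative, not part of the original header): the
   integer compares return all-ones or all-zeros per element, so they
   combine with the logical operations for branchless selection, e.g. a
   16-bit maximum built without _mm_max_epi16:

     __m128i __m = _mm_cmpgt_epi16(__a, __b);
     __m128i __max = _mm_or_si128(_mm_and_si128(__m, __a),
                                  _mm_andnot_si128(__m, __b)); */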

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Return a mask created from the most significant bit of each 8-bit
   element in A. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
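
/* Usage sketch (illustrative, not part of the original header):
   _mm_movemask_epi8 packs the 16 byte sign bits into bits 0..15 of the
   result, so testing two vectors for full equality reduces to a single
   scalar compare:

     int __eq = _mm_movemask_epi8(_mm_cmpeq_epi8(__a, __b)) == 0xFFFF; */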

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}
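
/* Worked example (illustrative, not part of the original header): the
   _mm_shuffle_epi32 mask packs four 2-bit source indices, with the
   selector for result element [0] in bits 1:0 up through element [3] in
   bits 7:6, so mask 0x00 broadcasts element [0]:

     __m128i __v = _mm_set_epi32(4, 3, 2, 1);
     _mm_shuffle_epi32(__v, 0x00);  // elements [0..3] = {1, 1, 1, 1}
     _mm_shuffle_epi32(__v, 0x1B);  // 0b00011011 reverses to {4, 3, 2, 1} */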

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use lightweight sync for load-to-load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavyweight sync for any-to-any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}
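/* Usage sketch (illustrative only, not part of the original header):
   publishing data written with the streaming stores above.  __buf,
   __data, and __flag are hypothetical; the heavyweight sync emitted
   by _mm_mfence keeps the data store visible before the flag store.

     _mm_stream_si128(__buf, __data);  // store with transient hint
     _mm_mfence();                     // order it before the flag
     __atomic_store_n(__flag, 1, __ATOMIC_RELAXED);
*/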

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */
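/* Usage sketch (illustrative only, appended commentary): the _mm_cast*
   intrinsics above are pure bit-pattern reinterpretations, so a round
   trip is an identity and compiles to no instructions.

     __m128d __d = _mm_set1_pd(1.0);
     __m128i __bits = _mm_castpd_si128(__d);    // same 128 bits, int view
     __m128d __back = _mm_castsi128_pd(__bits); // bitwise identical to __d
*/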