/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi16(__m128i __A) {
  return (__m128i)vec_abs((__v8hi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi32(__m128i __A) {
  return (__m128i)vec_abs((__v4si)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi8(__m128i __A) {
  return (__m128i)vec_abs((__v16qi)__A);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi16(__m64 __A) {
  __v8hi __B = (__v8hi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi32(__m64 __A) {
  __v4si __B = (__v4si)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi8(__m64 __A) {
  __v16qi __B = (__v16qi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
  if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
    __B = (__m128i)vec_reve((__v16qu)__B);
#endif
    __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
#endif
    return __A;
  }

  if (__count == 0)
    return __B;

  if (__count >= 16) {
    if (__count >= 32) {
      const __v16qu __zero = {0};
      return (__m128i)__zero;
    } else {
      const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
      return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
      return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
    }
  } else {
    const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
    const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
    __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
    return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
  }
}
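/* Note: _mm_alignr_epi8 mirrors the x86 PALIGNR semantics: it concatenates
   __A:__B (with __A in the upper half) and returns the low 16 bytes of that
   32-byte value shifted right by __count bytes; for example,
   _mm_alignr_epi8(__A, __B, 4) yields bytes __B[4..15] followed by
   __A[0..3].  Because vec_sld shifts in terms of the big-endian register
   layout, the constant-count path above byte-reverses both inputs on
   little-endian targets, shifts, and reverses the result back.  Counts that
   are not compile-time constants take the slower fallback path built from
   vec_slo/vec_sro octet shifts and vec_or.  */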
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
  if (__count < 16) {
    __v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
    const __v4su __shift = {__count << 3, 0, 0, 0};
    __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
    const __v4su __shift = {0, 0, 0, __count << 3};
    __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
    return (__m64)__C[0];
  } else {
    const __m64 __zero = {0};
    return __zero;
  }
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi32(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_epi16(__m128i __A, __m128i __B) {
  __v4si __C = {0}, __D = {0};
  __C = vec_sum4s((__v8hi)__A, __C);
  __D = vec_sum4s((__v8hi)__B, __D);
  __C = (__v4si)vec_packs(__C, __D);
  return (__m128i)__C;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_pi16(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v4si __D = vec_sum4s(__C, __zero);
  __C = vec_packs(__D, __D);
  return (__m64)((__v2du)__C)[1];
}
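/* Note: the horizontal subtracts below follow the same pattern as the
   horizontal adds above: the __P permute mask gathers the even-index
   elements of the two inputs and __Q gathers the odd-index elements, so
   each result lane of _mm_hsub_epi16 is __A[2i] - __A[2i+1] in the low
   half and __B[2i] - __B[2i+1] in the high half.  The saturating forms
   differ only in the final step: _mm_hadds_epi16 above relies on the
   saturating vec_sum4s/vec_packs pair, while _mm_hsubs_epi16 keeps the
   permute pattern and substitutes vec_subs for vec_sub.  */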
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}
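/* Note: the _mm_sign_* emulations below build a per-element multiplier of
   -1, 0, or +1 from the sign of each element of __B (vec_cmplt contributes
   -1 for negative lanes, the negated vec_cmpgt contributes +1 for positive
   lanes, and zero lanes contribute 0) and then multiply __A by it, matching
   the PSIGNB/PSIGNW/PSIGND behavior of negating, passing through, or
   zeroing each lane of __A.  They are guarded by _ARCH_PWR8 and are
   therefore only available when targeting POWER8 or newer.  */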
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}
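/* Note: _mm_maddubs_epi16 (PMADDUBSW) treats __A as unsigned bytes and __B
   as signed bytes, forms the 16-bit products of corresponding elements, and
   adds adjacent product pairs with signed saturation.  The emulation above
   widens both inputs with vec_unpackh/vec_unpackl, masks the copies of __A
   with 0x00ff to undo the sign extension and recover the unsigned byte
   values, multiplies, and then uses the odd/even permutes with vec_adds for
   the saturating pairwise add.  The __m64 variant below applies the same
   sequence to a single eight-byte input.  */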
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) */

#endif /* TMMINTRIN_H_ */