1 #pragma once 2 #ifndef PSIMD_H 3 #define PSIMD_H 4 5 #if defined(__CUDA_ARCH__) 6 /* CUDA compiler */ 7 #define PSIMD_INTRINSIC __forceinline__ __device__ 8 #elif defined(__OPENCL_VERSION__) 9 /* OpenCL compiler */ 10 #define PSIMD_INTRINSIC inline static 11 #elif defined(__INTEL_COMPILER) 12 /* Intel compiler, even on Windows */ 13 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) 14 #elif defined(__GNUC__) 15 /* GCC-compatible compiler (gcc/clang/icc) */ 16 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) 17 #elif defined(_MSC_VER) 18 /* MSVC-compatible compiler (cl/icl/clang-cl) */ 19 #define PSIMD_INTRINSIC __forceinline static 20 #elif defined(__cplusplus) 21 /* Generic C++ compiler */ 22 #define PSIMD_INTRINSIC inline static 23 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) 24 /* Generic C99 compiler */ 25 #define PSIMD_INTRINSIC inline static 26 #else 27 /* Generic C compiler */ 28 #define PSIMD_INTRINSIC static 29 #endif 30 31 #if defined(__GNUC__) 32 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 33 #include <arm_neon.h> 34 #endif 35 36 #if defined(__SSE2__) 37 #include <emmintrin.h> 38 #endif 39 40 #if defined(__SSE3__) 41 #include <pmmintrin.h> 42 #endif 43 44 #if defined(__SSSE3__) 45 #include <tmmintrin.h> 46 #endif 47 48 #if defined(__SSE4_1__) 49 #include <smmintrin.h> 50 #endif 51 52 #if defined(__SSE4_2__) 53 #include <nmmintrin.h> 54 #endif 55 56 #if defined(__AVX__) 57 #include <immintrin.h> 58 #endif 59 #elif defined(_MSC_VER) 60 #include <intrin.h> 61 #endif 62 63 #if defined(__cplusplus) 64 #define PSIMD_CXX_SYNTAX 65 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) 66 #define PSIMD_C11_SYNTAX 67 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) 68 #define PSIMD_C99_SYNTAX 69 #else 70 #define PSIMD_C89_SYNTAX 71 #endif 72 73 #if defined(__cplusplus) && (__cplusplus >= 201103L) 74 #include <cstddef> 75 #include <cstdint> 76 #elif 
!defined(__OPENCL_VERSION__) 77 #include <stddef.h> 78 #include <stdint.h> 79 #endif 80 81 #if defined(__GNUC__) 82 #define PSIMD_HAVE_F64 0 83 #define PSIMD_HAVE_F32 1 84 #define PSIMD_HAVE_U8 1 85 #define PSIMD_HAVE_S8 1 86 #define PSIMD_HAVE_U16 1 87 #define PSIMD_HAVE_S16 1 88 #define PSIMD_HAVE_U32 1 89 #define PSIMD_HAVE_S32 1 90 #define PSIMD_HAVE_U64 0 91 #define PSIMD_HAVE_S64 0 92 93 typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1))); 94 typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1))); 95 typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2))); 96 typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2))); 97 typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4))); 98 typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4))); 99 typedef float psimd_f32 __attribute__((vector_size(16), aligned(4))); 100 101 typedef struct { 102 psimd_s8 lo; 103 psimd_s8 hi; 104 } psimd_s8x2; 105 106 typedef struct { 107 psimd_u8 lo; 108 psimd_u8 hi; 109 } psimd_u8x2; 110 111 typedef struct { 112 psimd_s16 lo; 113 psimd_s16 hi; 114 } psimd_s16x2; 115 116 typedef struct { 117 psimd_u16 lo; 118 psimd_u16 hi; 119 } psimd_u16x2; 120 121 typedef struct { 122 psimd_s32 lo; 123 psimd_s32 hi; 124 } psimd_s32x2; 125 126 typedef struct { 127 psimd_u32 lo; 128 psimd_u32 hi; 129 } psimd_u32x2; 130 131 typedef struct { 132 psimd_f32 lo; 133 psimd_f32 hi; 134 } psimd_f32x2; 135 136 /* Bit casts */ psimd_cast_s32x2_u32x2(psimd_s32x2 v)137 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) { 138 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi }; 139 } 140 psimd_cast_s32x2_f32x2(psimd_s32x2 v)141 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) { 142 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi }; 143 } 144 psimd_cast_u32x2_s32x2(psimd_u32x2 v)145 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) { 146 
return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi }; 147 } 148 psimd_cast_u32x2_f32x2(psimd_u32x2 v)149 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) { 150 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi }; 151 } 152 psimd_cast_f32x2_s32x2(psimd_f32x2 v)153 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) { 154 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi }; 155 } 156 psimd_cast_f32x2_u32x2(psimd_f32x2 v)157 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) { 158 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi }; 159 } 160 161 /* Swap */ psimd_swap_s8(psimd_s8 a[1],psimd_s8 b[1])162 PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) { 163 const psimd_s8 new_a = *b; 164 const psimd_s8 new_b = *a; 165 *a = new_a; 166 *b = new_b; 167 } 168 psimd_swap_u8(psimd_u8 a[1],psimd_u8 b[1])169 PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) { 170 const psimd_u8 new_a = *b; 171 const psimd_u8 new_b = *a; 172 *a = new_a; 173 *b = new_b; 174 } 175 psimd_swap_s16(psimd_s16 a[1],psimd_s16 b[1])176 PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) { 177 const psimd_s16 new_a = *b; 178 const psimd_s16 new_b = *a; 179 *a = new_a; 180 *b = new_b; 181 } 182 psimd_swap_u16(psimd_u16 a[1],psimd_u16 b[1])183 PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) { 184 const psimd_u16 new_a = *b; 185 const psimd_u16 new_b = *a; 186 *a = new_a; 187 *b = new_b; 188 } 189 psimd_swap_s32(psimd_s32 a[1],psimd_s32 b[1])190 PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) { 191 const psimd_s32 new_a = *b; 192 const psimd_s32 new_b = *a; 193 *a = new_a; 194 *b = new_b; 195 } 196 psimd_swap_u32(psimd_u32 a[1],psimd_u32 b[1])197 PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) { 198 const psimd_u32 new_a = *b; 199 const psimd_u32 new_b = *a; 200 *a = new_a; 201 
*b = new_b; 202 } 203 psimd_swap_f32(psimd_f32 a[1],psimd_f32 b[1])204 PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) { 205 const psimd_f32 new_a = *b; 206 const psimd_f32 new_b = *a; 207 *a = new_a; 208 *b = new_b; 209 } 210 211 /* Zero-initialization */ psimd_zero_s8(void)212 PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) { 213 return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 214 } 215 psimd_zero_u8(void)216 PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) { 217 return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 218 } 219 psimd_zero_s16(void)220 PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) { 221 return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 }; 222 } 223 psimd_zero_u16(void)224 PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) { 225 return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 }; 226 } 227 psimd_zero_s32(void)228 PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) { 229 return (psimd_s32) { 0, 0, 0, 0 }; 230 } 231 psimd_zero_u32(void)232 PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) { 233 return (psimd_u32) { 0, 0, 0, 0 }; 234 } 235 psimd_zero_f32(void)236 PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) { 237 return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f }; 238 } 239 240 /* Initialization to the same constant */ psimd_splat_s8(int8_t c)241 PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) { 242 return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }; 243 } 244 psimd_splat_u8(uint8_t c)245 PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) { 246 return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }; 247 } 248 psimd_splat_s16(int16_t c)249 PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) { 250 return (psimd_s16) { c, c, c, c, c, c, c, c }; 251 } 252 psimd_splat_u16(uint16_t c)253 PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) { 254 return (psimd_u16) { c, c, c, c, c, c, c, c }; 255 } 256 psimd_splat_s32(int32_t c)257 PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) { 258 return 
(psimd_s32) { c, c, c, c }; 259 } 260 psimd_splat_u32(uint32_t c)261 PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) { 262 return (psimd_u32) { c, c, c, c }; 263 } 264 psimd_splat_f32(float c)265 PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) { 266 return (psimd_f32) { c, c, c, c }; 267 } 268 269 /* Load vector */ psimd_load_s8(const void * address)270 PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) { 271 return *((const psimd_s8*) address); 272 } 273 psimd_load_u8(const void * address)274 PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) { 275 return *((const psimd_u8*) address); 276 } 277 psimd_load_s16(const void * address)278 PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) { 279 return *((const psimd_s16*) address); 280 } 281 psimd_load_u16(const void * address)282 PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) { 283 return *((const psimd_u16*) address); 284 } 285 psimd_load_s32(const void * address)286 PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) { 287 return *((const psimd_s32*) address); 288 } 289 psimd_load_u32(const void * address)290 PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) { 291 return *((const psimd_u32*) address); 292 } 293 psimd_load_f32(const void * address)294 PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) { 295 return *((const psimd_f32*) address); 296 } 297 psimd_load1_f32(const void * address)298 PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) { 299 return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f }; 300 } 301 psimd_load2_f32(const void * address)302 PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) { 303 const float* address_f32 = (const float*) address; 304 return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f }; 305 } 306 psimd_load3_f32(const void * address)307 PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) { 308 const float* address_f32 = (const float*) 
address; 309 return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f }; 310 } 311 psimd_load4_f32(const void * address)312 PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) { 313 return psimd_load_f32(address); 314 } 315 psimd_load_stride2_f32(const void * address)316 PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) { 317 const psimd_f32 v0x1x = psimd_load_f32(address); 318 const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3); 319 #if defined(__clang__) 320 return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7); 321 #else 322 return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 }); 323 #endif 324 } 325 psimd_load1_stride2_f32(const void * address)326 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) { 327 return psimd_load_f32(address); 328 } 329 psimd_load2_stride2_f32(const void * address)330 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) { 331 const float* address_f32 = (const float*) address; 332 return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f }; 333 } 334 psimd_load3_stride2_f32(const void * address)335 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) { 336 const psimd_f32 v0x1x = psimd_load_f32(address); 337 const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2); 338 #if defined(__clang__) 339 return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6); 340 #else 341 return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 }); 342 #endif 343 } 344 psimd_load4_stride2_f32(const void * address)345 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) { 346 return psimd_load_stride2_f32(address); 347 } 348 psimd_load_stride_f32(const void * address,size_t stride)349 PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) { 350 const float* address0_f32 = (const float*) address; 351 const float* address1_f32 = address0_f32 + stride; 352 const 
float* address2_f32 = address1_f32 + stride; 353 const float* address3_f32 = address2_f32 + stride; 354 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 }; 355 } 356 psimd_load1_stride_f32(const void * address,size_t stride)357 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) { 358 return psimd_load1_f32(address); 359 } 360 psimd_load2_stride_f32(const void * address,size_t stride)361 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) { 362 const float* address_f32 = (const float*) address; 363 return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f }; 364 } 365 psimd_load3_stride_f32(const void * address,size_t stride)366 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) { 367 const float* address0_f32 = (const float*) address; 368 const float* address1_f32 = address0_f32 + stride; 369 const float* address2_f32 = address1_f32 + stride; 370 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f }; 371 } 372 psimd_load4_stride_f32(const void * address,size_t stride)373 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) { 374 return psimd_load_stride_f32(address, stride); 375 } 376 377 /* Store vector */ psimd_store_s8(void * address,psimd_s8 value)378 PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) { 379 *((psimd_s8*) address) = value; 380 } 381 psimd_store_u8(void * address,psimd_u8 value)382 PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) { 383 *((psimd_u8*) address) = value; 384 } 385 psimd_store_s16(void * address,psimd_s16 value)386 PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) { 387 *((psimd_s16*) address) = value; 388 } 389 psimd_store_u16(void * address,psimd_u16 value)390 PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) { 391 *((psimd_u16*) address) = value; 392 } 393 
psimd_store_s32(void * address,psimd_s32 value)394 PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) { 395 *((psimd_s32*) address) = value; 396 } 397 psimd_store_u32(void * address,psimd_u32 value)398 PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) { 399 *((psimd_u32*) address) = value; 400 } 401 psimd_store_f32(void * address,psimd_f32 value)402 PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) { 403 *((psimd_f32*) address) = value; 404 } 405 psimd_store1_f32(void * address,psimd_f32 value)406 PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) { 407 *((float*) address) = value[0]; 408 } 409 psimd_store2_f32(void * address,psimd_f32 value)410 PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) { 411 float* address_f32 = (float*) address; 412 address_f32[0] = value[0]; 413 address_f32[1] = value[1]; 414 } 415 psimd_store3_f32(void * address,psimd_f32 value)416 PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) { 417 float* address_f32 = (float*) address; 418 address_f32[0] = value[0]; 419 address_f32[1] = value[1]; 420 address_f32[2] = value[2]; 421 } 422 psimd_store4_f32(void * address,psimd_f32 value)423 PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) { 424 psimd_store_f32(address, value); 425 } 426 psimd_store_stride_f32(void * address,size_t stride,psimd_f32 value)427 PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) { 428 float* address0_f32 = (float*) address; 429 float* address1_f32 = address0_f32 + stride; 430 float* address2_f32 = address1_f32 + stride; 431 float* address3_f32 = address2_f32 + stride; 432 *address0_f32 = value[0]; 433 *address1_f32 = value[1]; 434 *address2_f32 = value[2]; 435 *address3_f32 = value[3]; 436 } 437 psimd_store1_stride_f32(void * address,size_t stride,psimd_f32 value)438 PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, 
psimd_f32 value) { 439 psimd_store1_f32(address, value); 440 } 441 psimd_store2_stride_f32(void * address,size_t stride,psimd_f32 value)442 PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) { 443 float* address_f32 = (float*) address; 444 address_f32[0] = value[0]; 445 address_f32[stride] = value[1]; 446 } 447 psimd_store3_stride_f32(void * address,size_t stride,psimd_f32 value)448 PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) { 449 float* address0_f32 = (float*) address; 450 float* address1_f32 = address0_f32 + stride; 451 float* address2_f32 = address1_f32 + stride; 452 *address0_f32 = value[0]; 453 *address1_f32 = value[1]; 454 *address2_f32 = value[2]; 455 } 456 457 /* Vector addition */ psimd_add_s8(psimd_s8 a,psimd_s8 b)458 PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) { 459 return a + b; 460 } 461 psimd_add_u8(psimd_u8 a,psimd_u8 b)462 PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) { 463 return a + b; 464 } 465 psimd_add_s16(psimd_s16 a,psimd_s16 b)466 PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) { 467 return a + b; 468 } 469 psimd_add_u16(psimd_u16 a,psimd_u16 b)470 PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) { 471 return a + b; 472 } 473 psimd_add_s32(psimd_s32 a,psimd_s32 b)474 PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) { 475 return a + b; 476 } 477 psimd_add_u32(psimd_u32 a,psimd_u32 b)478 PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) { 479 return a + b; 480 } 481 psimd_add_f32(psimd_f32 a,psimd_f32 b)482 PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) { 483 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) 484 return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b); 485 #else 486 return a + b; 487 #endif 488 } 489 490 /* Vector subtraction */ psimd_sub_s8(psimd_s8 a,psimd_s8 b)491 PSIMD_INTRINSIC 
psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) { 492 return a - b; 493 } 494 psimd_sub_u8(psimd_u8 a,psimd_u8 b)495 PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) { 496 return a - b; 497 } 498 psimd_sub_s16(psimd_s16 a,psimd_s16 b)499 PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) { 500 return a - b; 501 } 502 psimd_sub_u16(psimd_u16 a,psimd_u16 b)503 PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) { 504 return a - b; 505 } 506 psimd_sub_s32(psimd_s32 a,psimd_s32 b)507 PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) { 508 return a - b; 509 } 510 psimd_sub_u32(psimd_u32 a,psimd_u32 b)511 PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) { 512 return a - b; 513 } 514 psimd_sub_f32(psimd_f32 a,psimd_f32 b)515 PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) { 516 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) 517 return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b); 518 #else 519 return a - b; 520 #endif 521 } 522 523 /* Vector multiplication */ psimd_mul_s8(psimd_s8 a,psimd_s8 b)524 PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) { 525 return a * b; 526 } 527 psimd_mul_u8(psimd_u8 a,psimd_u8 b)528 PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) { 529 return a * b; 530 } 531 psimd_mul_s16(psimd_s16 a,psimd_s16 b)532 PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) { 533 return a * b; 534 } 535 psimd_mul_u16(psimd_u16 a,psimd_u16 b)536 PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) { 537 return a * b; 538 } 539 psimd_mul_s32(psimd_s32 a,psimd_s32 b)540 PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) { 541 return a * b; 542 } 543 psimd_mul_u32(psimd_u32 a,psimd_u32 b)544 PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) { 545 return a * b; 546 } 547 psimd_mul_f32(psimd_f32 a,psimd_f32 b)548 PSIMD_INTRINSIC psimd_f32 
psimd_mul_f32(psimd_f32 a, psimd_f32 b) { 549 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) 550 return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b); 551 #else 552 return a * b; 553 #endif 554 } 555 556 /* Vector and */ psimd_andmask_f32(psimd_s32 mask,psimd_f32 v)557 PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) { 558 return (psimd_f32) (mask & (psimd_s32) v); 559 } 560 561 /* Vector blend */ psimd_blend_s8(psimd_s8 mask,psimd_s8 a,psimd_s8 b)562 PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) { 563 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 564 return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b); 565 #else 566 return (mask & a) | (~mask & b); 567 #endif 568 } 569 psimd_blend_u8(psimd_u8 mask,psimd_u8 a,psimd_u8 b)570 PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_u8 mask, psimd_u8 a, psimd_u8 b) { 571 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 572 return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b); 573 #else 574 return (mask & a) | (~mask & b); 575 #endif 576 } 577 psimd_blend_s16(psimd_s16 mask,psimd_s16 a,psimd_s16 b)578 PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) { 579 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 580 return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b); 581 #else 582 return (mask & a) | (~mask & b); 583 #endif 584 } 585 psimd_blend_u16(psimd_u16 mask,psimd_u16 a,psimd_u16 b)586 PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_u16 mask, psimd_u16 a, psimd_u16 b) { 587 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 588 return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b); 589 #else 590 return (mask & a) | (~mask & b); 591 #endif 592 } 593 psimd_blend_s32(psimd_s32 mask,psimd_s32 a,psimd_s32 b)594 PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) { 595 #if 
defined(__ARM_NEON__) || defined(__ARM_NEON) 596 return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b); 597 #else 598 return (mask & a) | (~mask & b); 599 #endif 600 } 601 psimd_blend_u32(psimd_u32 mask,psimd_u32 a,psimd_u32 b)602 PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_u32 mask, psimd_u32 a, psimd_u32 b) { 603 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 604 return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b); 605 #else 606 return (mask & a) | (~mask & b); 607 #endif 608 } 609 psimd_blend_f32(psimd_s32 mask,psimd_f32 a,psimd_f32 b)610 PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) { 611 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 612 return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b); 613 #else 614 return (psimd_f32) psimd_blend_s32(mask, (psimd_s32) a, (psimd_s32) b); 615 #endif 616 } 617 618 /* Vector blend on sign */ psimd_signblend_s8(psimd_s8 x,psimd_s8 a,psimd_s8 b)619 PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) { 620 return psimd_blend_s8(x >> psimd_splat_s8(7), a, b); 621 } 622 psimd_signblend_u8(psimd_s8 x,psimd_u8 a,psimd_u8 b)623 PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) { 624 return psimd_blend_u8((psimd_u8) (x >> psimd_splat_s8(7)), a, b); 625 } 626 psimd_signblend_s16(psimd_s16 x,psimd_s16 a,psimd_s16 b)627 PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) { 628 return psimd_blend_s16(x >> psimd_splat_s16(15), a, b); 629 } 630 psimd_signblend_u16(psimd_s16 x,psimd_u16 a,psimd_u16 b)631 PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) { 632 return psimd_blend_u16((psimd_u16) (x >> psimd_splat_s16(15)), a, b); 633 } 634 psimd_signblend_s32(psimd_s32 x,psimd_s32 a,psimd_s32 b)635 PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) { 636 return 
psimd_blend_s32(x >> psimd_splat_s32(31), a, b); 637 } 638 psimd_signblend_u32(psimd_s32 x,psimd_u32 a,psimd_u32 b)639 PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) { 640 return psimd_blend_u32((psimd_u32) (x >> psimd_splat_s32(31)), a, b); 641 } 642 psimd_signblend_f32(psimd_f32 x,psimd_f32 a,psimd_f32 b)643 PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) { 644 const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31); 645 return psimd_blend_f32(mask, a, b); 646 } 647 648 /* Vector absolute value */ psimd_abs_f32(psimd_f32 v)649 PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) { 650 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f); 651 return (psimd_f32) ((psimd_s32) v & mask); 652 } 653 654 /* Vector negation */ psimd_neg_f32(psimd_f32 v)655 PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) { 656 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f); 657 return (psimd_f32) ((psimd_s32) v ^ mask); 658 } 659 660 /* Vector maximum */ psimd_max_s8(psimd_s8 a,psimd_s8 b)661 PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) { 662 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 663 return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b); 664 #else 665 return psimd_blend_s8(a > b, a, b); 666 #endif 667 } 668 psimd_max_u8(psimd_u8 a,psimd_u8 b)669 PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) { 670 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 671 return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b); 672 #else 673 return psimd_blend_u8(a > b, a, b); 674 #endif 675 } 676 psimd_max_s16(psimd_s16 a,psimd_s16 b)677 PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) { 678 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 679 return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b); 680 #else 681 return psimd_blend_s16(a > b, a, b); 682 #endif 683 } 684 psimd_max_u16(psimd_u16 a,psimd_u16 b)685 PSIMD_INTRINSIC psimd_u16 
psimd_max_u16(psimd_u16 a, psimd_u16 b) { 686 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 687 return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b); 688 #else 689 return psimd_blend_u16(a > b, a, b); 690 #endif 691 } 692 psimd_max_s32(psimd_s32 a,psimd_s32 b)693 PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) { 694 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 695 return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b); 696 #else 697 return psimd_blend_s32(a > b, a, b); 698 #endif 699 } 700 psimd_max_u32(psimd_u32 a,psimd_u32 b)701 PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) { 702 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 703 return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b); 704 #else 705 return psimd_blend_u32(a > b, a, b); 706 #endif 707 } 708 psimd_max_f32(psimd_f32 a,psimd_f32 b)709 PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) { 710 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 711 return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b); 712 #else 713 return psimd_blend_f32(a > b, a, b); 714 #endif 715 } 716 717 /* Vector minimum */ psimd_min_s8(psimd_s8 a,psimd_s8 b)718 PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) { 719 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 720 return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b); 721 #else 722 return psimd_blend_s8(a < b, a, b); 723 #endif 724 } 725 psimd_min_u8(psimd_u8 a,psimd_u8 b)726 PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) { 727 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 728 return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b); 729 #else 730 return psimd_blend_u8(a < b, a, b); 731 #endif 732 } 733 psimd_min_s16(psimd_s16 a,psimd_s16 b)734 PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) { 735 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 736 return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b); 737 #else 738 return 
psimd_blend_s16(a < b, a, b); 739 #endif 740 } 741 psimd_min_u16(psimd_u16 a,psimd_u16 b)742 PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) { 743 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 744 return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b); 745 #else 746 return psimd_blend_u16(a < b, a, b); 747 #endif 748 } 749 psimd_min_s32(psimd_s32 a,psimd_s32 b)750 PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) { 751 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 752 return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b); 753 #else 754 return psimd_blend_s32(a < b, a, b); 755 #endif 756 } 757 psimd_min_u32(psimd_u32 a,psimd_u32 b)758 PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) { 759 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 760 return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b); 761 #else 762 return psimd_blend_u32(a < b, a, b); 763 #endif 764 } 765 psimd_min_f32(psimd_f32 a,psimd_f32 b)766 PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) { 767 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 768 return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b); 769 #else 770 return psimd_blend_f32(a < b, a, b); 771 #endif 772 } 773 psimd_cvt_s32_f32(psimd_s32 v)774 PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) { 775 #if defined(__clang__) 776 return __builtin_convertvector(v, psimd_f32); 777 #elif defined(__ARM_NEON__) || defined(__ARM_NEON) 778 return (psimd_f32) vcvtq_f32_s32((int32x4_t) v); 779 #elif defined(__SSE2__) 780 return (psimd_f32) _mm_cvtepi32_ps((__m128i) v); 781 #else 782 return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] }; 783 #endif 784 } 785 786 /* Broadcast vector element */ 787 #if defined(__clang__) psimd_splat0_f32(psimd_f32 v)788 PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) { 789 return __builtin_shufflevector(v, v, 0, 0, 0, 0); 790 } 791 psimd_splat1_f32(psimd_f32 v)792 PSIMD_INTRINSIC 
psimd_f32 psimd_splat1_f32(psimd_f32 v) {
    /* Broadcast lane 1 of v into all four lanes. */
    return __builtin_shufflevector(v, v, 1, 1, 1, 1);
}

/* Broadcast lane 2 of v into all four lanes. */
PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 2, 2, 2, 2);
}

/* Broadcast lane 3 of v into all four lanes. */
PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 3, 3, 3, 3);
}
#else
/*
 * GCC path: __builtin_shuffle takes the lane selector as a separate integer
 * vector with the same element count/size as the data vector, rather than
 * clang's variadic constant indices.
 */
PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
}

PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
}

PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
}

PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
}
#endif

/* Reversal of vector elements */
#if defined(__clang__)
PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
    return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
    return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
    return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
    return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
    return __builtin_shufflevector(v, v, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
    return __builtin_shufflevector(v, v, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 3, 2, 1, 0);
}
#else
/*
 * The selector vector must be an integer vector with matching element size,
 * so signed mask types (psimd_s8/s16/s32) are used even for unsigned and
 * float data vectors.
 */
PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
    return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
    return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
    return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
    return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
}
#endif

/* Interleaving of vector elements */
#if defined(__clang__)
/*
 * Indices 0..N-1 select lanes of the first operand a; index N+k selects
 * lane k of the second operand b (N = lane count). The 8+k / 4+k spelling
 * keeps the b-lane offsets visually explicit.
 */
PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
}
#else
PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
}
PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
}
#endif

/* Concatenation of low/high vector elements */
#if defined(__clang__)
/*
 * concat_lo joins the low halves of a and b; concat_hi joins the high
 * halves. Index N+k (spelled 8+k / 4+k) selects lane k of b.
 */
PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
}
#else
PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
}
#endif

/* Concatenation of even/odd vector elements */
#if defined(__clang__)
/*
 * concat_even packs the even-indexed lanes of a then of b; concat_odd
 * packs the odd-indexed lanes. Indices 16+k / 8+k / 4+k select lane k
 * of the second operand b.
 */
PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
    return __builtin_shufflevector(a, b,
        0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
}

PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
    return __builtin_shufflevector(a, b,
        1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
    return __builtin_shufflevector(a, b,
        0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
    return __builtin_shufflevector(a, b,
        1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
}
#else
/*
 * GCC path: signed mask vectors of matching element size are required by
 * __builtin_shuffle, even for unsigned and float data vectors.
 */
PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
    return __builtin_shuffle(a, b,
        (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
}

PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
    return __builtin_shuffle(a, b,
        (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
    return __builtin_shuffle(a, b,
        (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
    return __builtin_shuffle(a, b,
        (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
    return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
    return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
}
#endif

/* Vector reduce */
#if defined(__clang__)
/*
 * allreduce_* fold all four lanes pairwise (swap 64-bit halves, combine;
 * swap 32-bit neighbors, combine) so every lane of the result holds the
 * reduction. reduce_* return only lane 0 as a scalar; the -1 shuffle
 * indices mark "don't-care" lanes (clang treats a -1 index as an
 * undefined element), which lets the compiler pick cheaper shuffles.
 */
PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
    const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
    return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
}

PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
    const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
    return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
}

PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
    const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
    return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
}

PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
    const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
    const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
    return result[0];
}

PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
    const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
    const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
    return result[0];
}

PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
    const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
    const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
    return result[0];
}
#else
/*
 * GCC path: __builtin_shuffle has no "don't-care" index, so the scalar
 * reduce_* wrappers simply reuse the full allreduce_* and extract lane 0.
 */
PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
    const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
    return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
}

PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
    const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
    return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
}

PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
    const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
    return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
}

PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
    const psimd_f32 result = psimd_allreduce_sum_f32(v);
    return result[0];
}

PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
    const psimd_f32 result = psimd_allreduce_max_f32(v);
    return result[0];
}

PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
    const psimd_f32 result = psimd_allreduce_min_f32(v);
    return result[0];
}
#endif
#endif

#endif /* PSIMD_H */