1 // Copyright 2008-present Contributors to the OpenImageIO project. 2 // SPDX-License-Identifier: BSD-3-Clause 3 // https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md 4 5 /// @file simd.h 6 /// 7 /// @brief Classes for SIMD processing. 8 /// 9 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.): 10 /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/ 11 /// 12 /// Similar guide for ARM intrinsics: 13 /// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics 14 /// 15 /// It helped me a lot to peruse the source of these packages: 16 /// Syrah: https://github.com/boulos/syrah 17 /// Embree: https://github.com/embree 18 /// Vectorial: https://github.com/scoopr/vectorial 19 /// 20 /// To find out which CPU features you have: 21 /// Linux: cat /proc/cpuinfo 22 /// OSX: sysctl machdep.cpu.features 23 /// 24 /// Additional web resources: 25 /// http://www.codersnotes.com/notes/maths-lib-2016/ 26 27 // clang-format off 28 29 #pragma once 30 31 #include <algorithm> 32 #include <cstring> 33 34 #include <OpenImageIO/Imath.h> 35 #include <OpenImageIO/dassert.h> 36 #include <OpenImageIO/platform.h> 37 38 39 ////////////////////////////////////////////////////////////////////////// 40 // Sort out which SIMD capabilities we have and set definitions 41 // appropriately. This is mostly for internal (within this file) use, 42 // but client applications using this header may find a few of the macros 43 // we define to be useful: 44 // 45 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD 46 // hardware is available, this will hold the width in number of 47 // float SIMD "lanes" of widest SIMD registers available. For 48 // example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are 49 // hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated, 50 // etc. 
//               Using SIMD classes wider than this should work (will be
//               emulated with narrower SIMD or scalar operations), but is not
//               expected to have high performance.
// OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
//               specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
//               higher (including AVX).
// OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
//               specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
// OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
// OIIO_SIMD_MAX_SIZE : holds the width in bytes of the widest SIMD
//               available (generally will be OIIO_SIMD*4).
// OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
// OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
// OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
// OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
// OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
// OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined

// Pull in the compiler-specific intrinsics headers. MSVC exposes everything
// through <intrin.h>; GCC/Clang on x86 (and Elbrus e2k, which emulates the
// x86 intrinsics) use <x86intrin.h>; ARM uses <arm_neon.h>.
#if defined(_WIN32)
#  include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
#  include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
#  include <arm_neon.h>
#endif

// Disable SSE for 32 bit Windows platforms, it's unreliable and hard for us
// to test thoroughly. We presume that anybody needing high performance
// badly enough to want SIMD also is on a 64 bit CPU.
#if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
#define OIIO_NO_SSE 1
#endif

// Detect the SSE level. NOTE(review): on non-MSVC compilers _MSC_VER is
// undefined and therefore evaluates to 0 inside #if, so this condition is
// safe everywhere.
#if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
#  if (defined(__SSE4_1__) || defined(__SSE4_2__))
#    define OIIO_SIMD_SSE 4
      /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
       * instructions specific to 4.2, but they are all related to string
       * comparisons and CRCs, which don't currently seem relevant to OIIO,
       * so for simplicity, we sweep this difference under the rug.
       */
#  elif defined(__SSSE3__)
#    define OIIO_SIMD_SSE 3
      /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
       * there are a few older architectures that are SSE3 but not SSSE3,
       * and this simplification means that these particular old platforms
       * will only get SSE2 goodness out of our code. So be it. Anybody who
       * cares about performance is probably using a 64 bit machine that's
       * SSE 4.x or AVX by now.
       */
#  else
#    define OIIO_SIMD_SSE 2
#  endif
#  define OIIO_SIMD 4
#  define OIIO_SIMD_MAX_SIZE_BYTES 16
#  define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
#  define OIIO_SSE_ALIGN OIIO_ALIGN(16)
#else
#  define OIIO_SIMD_SSE 0
#endif

#if defined(__AVX__) && !defined(OIIO_NO_AVX)
   // N.B. Any machine with AVX will also have SSE
#  if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
#    define OIIO_SIMD_AVX 2
#  else
#    define OIIO_SIMD_AVX 1
#  endif
#  undef OIIO_SIMD
#  define OIIO_SIMD 8
#  undef OIIO_SIMD_MAX_SIZE_BYTES
#  define OIIO_SIMD_MAX_SIZE_BYTES 32
#  define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
#  define OIIO_AVX_ALIGN OIIO_ALIGN(32)
#  if defined(__AVX512F__)
     // AVX512 foundation: widest registers are 64 bytes / 16 floats.
#    undef OIIO_SIMD_AVX
#    define OIIO_SIMD_AVX 512
#    undef OIIO_SIMD_MAX_SIZE_BYTES
#    define OIIO_SIMD_MAX_SIZE_BYTES 64
#    undef OIIO_SIMD
#    define OIIO_SIMD 16
#    define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
#    define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
#    define OIIO_AVX512F_ENABLED 1
#  endif
   // Optional AVX512 extension sets, each reported as its own 0/1 macro.
#  if defined(__AVX512DQ__)
#    define OIIO_AVX512DQ_ENABLED 1   /* Doubleword and quadword */
#  else
#    define OIIO_AVX512DQ_ENABLED 0
#  endif
#  if defined(__AVX512PF__)
#    define OIIO_AVX512PF_ENABLED 1   /* Prefetch */
#  else
#    define OIIO_AVX512PF_ENABLED 0
#  endif
#  if defined(__AVX512ER__)
#    define OIIO_AVX512ER_ENABLED 1   /* Exponential & reciprocal */
#  else
#    define OIIO_AVX512ER_ENABLED 0
#  endif
#  if defined(__AVX512CD__)
#    define OIIO_AVX512CD_ENABLED 1   /* Conflict detection */
#  else
#    define OIIO_AVX512CD_ENABLED 0
#  endif
#  if defined(__AVX512BW__)
#    define OIIO_AVX512BW_ENABLED 1   /* Byte and word */
#  else
#    define OIIO_AVX512BW_ENABLED 0
#  endif
#  if defined(__AVX512VL__)
#    define OIIO_AVX512VL_ENABLED 1   /* Vector length extensions */
#  else
#    define OIIO_AVX512VL_ENABLED 0
#  endif
#else
   // No AVX at all: define all the AVX-related feature macros as 0 so
   // client code can test them unconditionally.
#  define OIIO_SIMD_AVX 0
#  define OIIO_AVX512VL_ENABLED 0
#  define OIIO_AVX512DQ_ENABLED 0
#  define OIIO_AVX512PF_ENABLED 0
#  define OIIO_AVX512ER_ENABLED 0
#  define OIIO_AVX512CD_ENABLED 0
#  define OIIO_AVX512BW_ENABLED 0
#endif

// Fused multiply-add availability.
#if defined(__FMA__)
#  define OIIO_FMA_ENABLED 1
#else
#  define OIIO_FMA_ENABLED 0
#endif
#if defined(__AVX512IFMA__)
#  define OIIO_AVX512IFMA_ENABLED 1
#else
#  define OIIO_AVX512IFMA_ENABLED 0
#endif

// Half-float (fp16) conversion instruction availability.
#if defined(__F16C__)
#  define OIIO_F16C_ENABLED 1
#else
#  define OIIO_F16C_ENABLED 0
#endif

// FIXME Future: support ARM Neon
// Uncomment this when somebody with Neon can verify it works
#if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
#  define OIIO_SIMD 4
#  define OIIO_SIMD_NEON 1
#  define OIIO_SIMD_MAX_SIZE_BYTES 16
#  define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
#  define OIIO_SSE_ALIGN OIIO_ALIGN(16)
#else
#  define OIIO_SIMD_NEON 0
#endif

#ifndef OIIO_SIMD
   // No SIMD available: everything is emulated with scalar code, but the
   // classes still exist, so define the macros they rely on.
#  define OIIO_SIMD 0
#  define OIIO_SIMD4_ALIGN
#  define OIIO_SIMD_MAX_SIZE_BYTES 16
#endif

// If wider alignments weren't set above (no AVX/AVX512), fall back to the
// next narrower alignment so the 8- and 16-wide emulated types still work.
#ifndef OIIO_SIMD8_ALIGN
#  define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
#endif
#ifndef OIIO_SIMD16_ALIGN
#  define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
#endif


// General features that client apps may want to test for, for conditional
// compilation. Will add to this over time as needed. Note that just
// because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
// the vfloat8 class (and friends) are in this version of simd.h, but that's
// different from OIIO_SIMD >= 8, which means it's supported in hardware.
#define OIIO_SIMD_HAS_MATRIX4 1  /* matrix44 defined */
#define OIIO_SIMD_HAS_FLOAT8 1   /* DEPRECATED(1.8) */
#define OIIO_SIMD_HAS_SIMD8 1    /* vfloat8, vint8, vbool8 defined */
#define OIIO_SIMD_HAS_SIMD16 1   /* vfloat16, vint16, vbool16 defined */


// Embarrassing hack: Xlib.h #define's True and False!
#ifdef True
#  undef True
#endif
#ifdef False
#  undef False
#endif



OIIO_NAMESPACE_BEGIN

namespace simd {

//////////////////////////////////////////////////////////////////////////
// Forward declarations of our main SIMD classes

class vbool4;
class vint4;
class vfloat4;
class vfloat3;
class matrix44;
class vbool8;
class vint8;
class vfloat8;
class vbool16;
class vint16;
class vfloat16;

// Deprecated names -- remove these in 1.9
typedef vbool4 mask4;    // old name
typedef vbool4 bool4;
typedef vbool8 bool8;
typedef vint4 int4;
typedef vint8 int8;
typedef vfloat3 float3;
typedef vfloat4 float4;
typedef vfloat8 float8;



//////////////////////////////////////////////////////////////////////////
// Template magic to determine the raw SIMD types involved, and other
// things helpful for metaprogramming.

// Generic fallback: when no hardware type exists for a given element type
// and width, emulate the "raw" type with a plain array of N scalars (and
// the bool type with an array of N ints).
template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
template <int N> struct simd_bool_t { struct type { int val[N]; }; };

#if OIIO_SIMD_SSE
template<> struct simd_raw_t<int,4> { typedef __m128i type; };
template<> struct simd_raw_t<float,4> { typedef __m128 type; };
template<> struct simd_bool_t<4> { typedef __m128 type; };
#endif

#if OIIO_SIMD_AVX
template<> struct simd_raw_t<int,8> { typedef __m256i type; };
template<> struct simd_raw_t<float,8> { typedef __m256 type; };
template<> struct simd_bool_t<8> { typedef __m256 type; };
#endif

#if OIIO_SIMD_AVX >= 512
template<> struct simd_raw_t<int,16> { typedef __m512i type; };
template<> struct simd_raw_t<float,16> { typedef __m512 type; };
template<> struct simd_bool_t<16> { typedef __mmask16 type; };
#else
// Note: change in strategy for 16-wide SIMD: instead of int[16] for
// vbool16, it's just a plain old bitmask, and __mmask16 for actual HW.
template<> struct simd_bool_t<16> { typedef uint16_t type; };
#endif

#if OIIO_SIMD_NEON
template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
#endif


/// Template to retrieve the vector type from the scalar. For example,
/// simd::VecType<int,4>::type will be vint4 and
/// simd::VecType<float,4>::type will be vfloat4.
// (Fixed comment: the old comment incorrectly said VecType<int,4> is
// vfloat4; per the specializations below it is vint4.)
template<typename T,int elements> struct VecType {};
template<> struct VecType<int,1>   { typedef int type; };
template<> struct VecType<float,1> { typedef float type; };
template<> struct VecType<int,4>   { typedef vint4 type; };
template<> struct VecType<float,4> { typedef vfloat4 type; };
template<> struct VecType<float,3> { typedef vfloat3 type; };
template<> struct VecType<bool,4>  { typedef vbool4 type; };
template<> struct VecType<int,8>   { typedef vint8 type; };
template<> struct VecType<float,8> { typedef vfloat8 type; };
template<> struct VecType<bool,8>  { typedef vbool8 type; };
template<> struct VecType<int,16>   { typedef vint16 type; };
template<> struct VecType<float,16> { typedef vfloat16 type; };
template<> struct VecType<bool,16>  { typedef vbool16 type; };

/// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
/// anything but our SIMD types. Note that vfloat3 reports 4, because it is
/// stored padded out to a full 4-wide register.
template<typename T> struct SimdSize { static const int size = 1; };
template<> struct SimdSize<vint4>    { static const int size = 4; };
template<> struct SimdSize<vfloat4>  { static const int size = 4; };
template<> struct SimdSize<vfloat3>  { static const int size = 4; };
template<> struct SimdSize<vbool4>   { static const int size = 4; };
template<> struct SimdSize<vint8>    { static const int size = 8; };
template<> struct SimdSize<vfloat8>  { static const int size = 8; };
template<> struct SimdSize<vbool8>   { static const int size = 8; };
template<> struct SimdSize<vint16>   { static const int size = 16; };
template<> struct SimdSize<vfloat16> { static const int size = 16; };
template<> struct SimdSize<vbool16>  { static const int size = 16; };

/// Template to retrieve the number of elements size of a SIMD type. Rigged
/// to be 1 for anything but our SIMD types. Unlike SimdSize, this is the
/// logical element count, so vfloat3 reports 3 (its fourth lane is padding).
template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
template<> struct SimdElements<vfloat3> { static const int size = 3; };

/// Template giving a printable name for each type
template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
template<> struct SimdTypeName<vfloat4> { static const char *name() { return "vfloat4"; } };
template<> struct SimdTypeName<vint4>   { static const char *name() { return "vint4"; } };
template<> struct SimdTypeName<vbool4>  { static const char *name() { return "vbool4"; } };
template<> struct SimdTypeName<vfloat8> { static const char *name() { return "vfloat8"; } };
template<> struct SimdTypeName<vint8>   { static const char *name() { return "vint8"; } };
template<> struct SimdTypeName<vbool8>  { static const char *name() { return "vbool8"; } };
template<> struct SimdTypeName<vfloat16> { static const char *name() { return "vfloat16"; } };
template<> struct SimdTypeName<vint16>   { static const char *name() { return "vint16"; } };
template<> struct SimdTypeName<vbool16>  { static const char *name() { return "vbool16"; } };


//////////////////////////////////////////////////////////////////////////
// Macros helpful for making static constants in code.

// Each of these macros declares a static, properly aligned constant array
// suitable for loading into the corresponding SIMD type. The "_CONST"
// variants replicate one value into every slot; the "_CONSTn" variants
// take n distinct per-slot values.

# define OIIO_SIMD_FLOAT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_INT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_UINT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }

# define OIIO_SIMD_FLOAT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
                                                    (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
                                                    (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_INT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
                                                  (val), (val), (val), (val) }
# define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
                                                  (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_UINT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
                                                       (val), (val), (val), (val) }
# define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
                                                       (v4), (v5), (v6), (v7) }

# define OIIO_SIMD_VFLOAT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_INT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_UINT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
// BUG FIX: this macro previously expanded to sixteen copies of `(val)`,
// which both ignored all sixteen v0..v15 arguments and referenced an
// undefined identifier `val`, so any use failed to compile. It now uses
// its per-slot arguments, matching INT16_CONST16 / VFLOAT16_CONST16.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }


//////////////////////////////////////////////////////////////////////////
// Some macros just for use in this file (#undef-ed at the end) making
// it more succinct to express per-element operations.
419 420 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x 421 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x) 422 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \ 423 for (int i = elements; i < paddedelements; ++i) m_val[i] = 0 424 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r 425 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r 426 427 428 429 ////////////////////////////////////////////////////////////////////////// 430 ////////////////////////////////////////////////////////////////////////// 431 // The public declarations of the main SIMD classes follow: boolN, intN, 432 // floatN, matrix44. 433 // 434 // These class declarations are intended to be brief and self-documenting, 435 // and give all the information that users or client applications need to 436 // know to use these classes. 437 // 438 // No implementations are given inline except for the briefest, completely 439 // generic methods that don't have any architecture-specific overloads. 440 // After the class defintions, there will be an immense pile of full 441 // implementation definitions, which casual users are not expected to 442 // understand. 443 ////////////////////////////////////////////////////////////////////////// 444 ////////////////////////////////////////////////////////////////////////// 445 446 447 /// vbool4: An 4-vector whose elements act mostly like bools, accelerated by 448 /// SIMD instructions when available. This is what is naturally produced by 449 /// SIMD comparison operators on the vfloat4 and vint4 types. 
class vbool4 {
public:
    static const char* type_name() { return "vbool4"; }
    typedef bool value_t;        ///< Underlying equivalent scalar value type
    enum { elements = 4 };       ///< Number of scalar elements
    enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
    enum { bits = elements*32 }; ///< Total number of bits
    typedef simd_bool_t<4>::type simd_t;  ///< the native SIMD type used

    /// Default constructor (contents undefined)
    vbool4 () { }

    /// Construct from a single value (store it in all slots)
    vbool4 (bool a) { load(a); }

    /// Construct from a pointer to 4 bool values
    explicit vbool4 (const bool *a);

    /// Construct from 4 bool values
    vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }

    /// Copy construct from another vbool4
    vbool4 (const vbool4 &other) { m_simd = other.m_simd; }

    /// Construct from 4 int values (each is converted as nonzero -> true)
    vbool4 (int a, int b, int c, int d) {
        load (bool(a), bool(b), bool(c), bool(d));
    }

    /// Construct from a SIMD int (is each element nonzero?)
    vbool4 (const vint4 &i);

    /// Construct from the underlying SIMD type
    vbool4 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Extract the bitmask (one bit per element)
    int bitmask () const;

    /// Convert from integer bitmask to a true vbool4
    static vbool4 from_bitmask (int bitmask);

    /// Set all components to false
    void clear ();

    /// Return a vbool4 that is 'false' for all values
    static const vbool4 False ();

    /// Return a vbool4 that is 'true' for all values
    static const vbool4 True ();

    /// Assign one value to all components
    const vbool4 & operator= (bool a) { load(a); return *this; }

    /// Assignment of another vbool4
    const vbool4 & operator= (const vbool4 & other);

    /// Component access (get)
    int operator[] (int i) const;

    /// Component access (set).
    void setcomp (int i, bool value);

    /// Component access (set).
    /// NOTE: avoid this unsafe construct. It will go away some day.
    int& operator[] (int i);

    /// Helper: load a single value into all components.
    void load (bool a);

    /// Helper: load separate values into each component.
    void load (bool a, bool b, bool c, bool d);

    /// Helper: store the values into memory as bools.
    void store (bool *values) const;

    /// Store the first n values into memory.
    void store (bool *values, int n) const;

    /// Logical/bitwise operators, component-by-component
    friend vbool4 operator! (const vbool4& a);
    friend vbool4 operator& (const vbool4& a, const vbool4& b);
    friend vbool4 operator| (const vbool4& a, const vbool4& b);
    friend vbool4 operator^ (const vbool4& a, const vbool4& b);
    friend vbool4 operator~ (const vbool4& a);
    friend const vbool4& operator&= (vbool4& a, const vbool4& b);
    friend const vbool4& operator|= (vbool4& a, const vbool4& b);
    friend const vbool4& operator^= (vbool4& a, const vbool4& b);

    /// Comparison operators, component by component
    friend vbool4 operator== (const vbool4& a, const vbool4& b);
    friend vbool4 operator!= (const vbool4& a, const vbool4& b);

    /// Stream output
    friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);

private:
    // The actual data representation: the hardware SIMD register aliased
    // with a plain int array for scalar/emulated access.
    union {
        simd_t m_simd;
        int m_val[paddedelements];
    };
};



/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
template<int i0, int i1, int i2, int i3> vbool4 shuffle (const vbool4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vbool4 shuffle (const vbool4& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> bool extract (const vbool4& a);

/// Helper: substitute val for a[i]
template<int i> vbool4 insert (const vbool4& a, bool val);

/// Logical reduction across all components.
bool reduce_and (const vbool4& v);
bool reduce_or (const vbool4& v);

// Are all/any/no components true?
bool all (const vbool4& v);
bool any (const vbool4& v);
bool none (const vbool4& v);

// It's handy to have this defined for regular bool as well
inline bool all (bool v) { return v; }



/// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
/// SIMD instructions when available.
/// This is what is naturally produced by
/// SIMD comparison operators on the vfloat8 and vint8 types.
class vbool8 {
public:
    static const char* type_name() { return "vbool8"; }
    typedef bool value_t;        ///< Underlying equivalent scalar value type
    enum { elements = 8 };       ///< Number of scalar elements
    enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
    enum { bits = elements*32 }; ///< Total number of bits
    typedef simd_bool_t<8>::type simd_t;  ///< the native SIMD type used

    /// Default constructor (contents undefined)
    vbool8 () { }

    /// Construct from a single value (store it in all slots)
    vbool8 (bool a) { load (a); }

    /// Construct from a pointer to 8 bool values
    explicit vbool8 (const bool *values);

    /// Construct from 8 bool values
    vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);

    /// Copy construct from another vbool8
    vbool8 (const vbool8 &other) { m_simd = other.m_simd; }

    /// Construct from 8 int values (each is converted as nonzero -> true)
    vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);

    /// Construct from a SIMD int (is each element nonzero?)
    vbool8 (const vint8 &i);

    /// Construct from two vbool4's
    vbool8 (const vbool4 &lo, const vbool4 &hi);

    /// Construct from the underlying SIMD type
    vbool8 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Extract the bitmask (one bit per element)
    int bitmask () const;

    /// Convert from integer bitmask to a true vbool8
    static vbool8 from_bitmask (int bitmask);

    /// Set all components to false
    void clear ();

    /// Return a vbool8 that is 'false' for all values
    static const vbool8 False ();

    /// Return a vbool8 that is 'true' for all values
    static const vbool8 True ();

    /// Assign one value to all components
    const vbool8 & operator= (bool a);

    /// Assignment of another vbool8
    const vbool8 & operator= (const vbool8 & other);

    /// Component access (get)
    int operator[] (int i) const;

    /// Component access (set).
    void setcomp (int i, bool value);

    /// Component access (set).
    /// NOTE: avoid this unsafe construct. It will go away some day.
    int& operator[] (int i);

    /// Extract the lower precision vbool4 (components 0..3)
    vbool4 lo () const;

    /// Extract the higher precision vbool4 (components 4..7)
    vbool4 hi () const;

    /// Helper: load a single value into all components.
    void load (bool a);

    /// Helper: load separate values into each component.
    void load (bool a, bool b, bool c, bool d,
               bool e, bool f, bool g, bool h);

    /// Helper: store the values into memory as bools.
    void store (bool *values) const;

    /// Store the first n values into memory.
    void store (bool *values, int n) const;

    /// Logical/bitwise operators, component-by-component
    friend vbool8 operator! (const vbool8& a);
    friend vbool8 operator& (const vbool8& a, const vbool8& b);
    friend vbool8 operator| (const vbool8& a, const vbool8& b);
    friend vbool8 operator^ (const vbool8& a, const vbool8& b);
    friend vbool8 operator~ (const vbool8& a);
    friend const vbool8& operator&= (vbool8& a, const vbool8& b);
    friend const vbool8& operator|= (vbool8& a, const vbool8& b);
    friend const vbool8& operator^= (vbool8& a, const vbool8& b);

    /// Comparison operators, component by component
    friend vbool8 operator== (const vbool8& a, const vbool8& b);
    friend vbool8 operator!= (const vbool8& a, const vbool8& b);

    /// Stream output
    friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);

private:
    // The actual data representation: the hardware SIMD register aliased
    // with an int array (scalar access) and a pair of vbool4 halves.
    union {
        simd_t m_simd;
        int m_val[paddedelements];
        vbool4 m_4[2];
    };
};



/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2,4,4,5,5>(vbool8(a,b,c,d,e,f,g,h)) returns
/// (b,b,c,c,e,e,f,f).
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
vbool8 shuffle (const vbool8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
template<int i> vbool8 shuffle (const vbool8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> bool extract (const vbool8& a);

/// Helper: substitute val for a[i]
template<int i> vbool8 insert (const vbool8& a, bool val);

/// Logical reduction across all components.
bool reduce_and (const vbool8& v);
bool reduce_or (const vbool8& v);

// Are all/any/no components true?
bool all (const vbool8& v);
bool any (const vbool8& v);
bool none (const vbool8& v);




/// vbool16: A 16-vector whose elements act mostly like bools, accelerated
/// by SIMD instructions when available.
/// This is what is naturally produced
/// by SIMD comparison operators on the vfloat16 and vint16 types.
class vbool16 {
public:
    static const char* type_name() { return "vbool16"; }
    typedef bool value_t;         ///< Underlying equivalent scalar value type
    enum { elements = 16 };       ///< Number of scalar elements
    enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
    // Unlike vbool4/vbool8 (one 32-bit lane per element), vbool16 is
    // represented as a 16-bit mask, one bit per element.
    enum { bits = 16 };           ///< Total number of bits
    typedef simd_bool_t<16>::type simd_t;  ///< the native SIMD type used

    /// Default constructor (contents undefined)
    vbool16 () { }

    /// Construct from a single value (store it in all slots)
    vbool16 (bool a) { load (a); }

    /// Construct from an integer bitmask (one bit per element)
    explicit vbool16 (int bitmask) { load_bitmask (bitmask); }

    /// Construct from a pointer to 16 bool values
    explicit vbool16 (const bool *values);

    /// Construct from 16 bool values
    vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
             bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);

    /// Copy construct from another vbool16
    vbool16 (const vbool16 &other) { m_simd = other.m_simd; }

    /// Construct from 16 int values (each is converted as nonzero -> true)
    vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
             int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);

    /// Construct from a SIMD int (is each element nonzero?)
    vbool16 (const vint16 &i);

    /// Construct from two vbool8's
    vbool16 (const vbool8 &lo, const vbool8 &hi);

    /// Construct from four vbool4's
    vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);

    /// Construct from the underlying SIMD type
    vbool16 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Extract the bitmask (one bit per element)
    int bitmask () const;

    /// Convert from integer bitmask to a true vbool16
    static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }

    /// Set all components to false
    void clear ();

    /// Return a vbool16 that is 'false' for all values
    static const vbool16 False ();

    /// Return a vbool16 that is 'true' for all values
    static const vbool16 True ();

    /// Assign one value to all components
    const vbool16 & operator= (bool a);

    /// Assignment of another vbool16
    const vbool16 & operator= (const vbool16 & other);

    /// Component access (get)
    int operator[] (int i) const;

    /// Component access (set).
    void setcomp (int i, bool value);

    /// Extract the lower precision vbool8 (components 0..7)
    vbool8 lo () const;

    /// Extract the higher precision vbool8 (components 8..15)
    vbool8 hi () const;

    /// Helper: load a single value into all components.
    void load (bool a);

    /// Helper: load separate values into each component.
    void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
               bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);

    /// Helper: load all components from a bitmask in an int.
    void load_bitmask (int a);

    /// Helper: store the values into memory as bools.
    void store (bool *values) const;

    /// Store the first n values into memory.
    void store (bool *values, int n) const;

    /// Logical/bitwise operators, component-by-component
    // NOTE(review): the vbool4 friend declaration below looks like a
    // copy-paste leftover -- operator!(vbool4) presumably does not need
    // access to vbool16 internals. Harmless, kept for fidelity; confirm
    // before removing.
    friend vbool4 operator! (const vbool4& a);
    friend vbool16 operator! (const vbool16& a);
    friend vbool16 operator& (const vbool16& a, const vbool16& b);
    friend vbool16 operator| (const vbool16& a, const vbool16& b);
    friend vbool16 operator^ (const vbool16& a, const vbool16& b);
    friend vbool16 operator~ (const vbool16& a);
    friend const vbool16& operator&= (vbool16& a, const vbool16& b);
    friend const vbool16& operator|= (vbool16& a, const vbool16& b);
    friend const vbool16& operator^= (vbool16& a, const vbool16& b);

    /// Comparison operators, component by component
    friend vbool16 operator== (const vbool16& a, const vbool16& b);
    friend vbool16 operator!= (const vbool16& a, const vbool16& b);

    /// Stream output
    friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);

private:
    // The actual data representation: the hardware mask register (or
    // uint16_t when emulated) aliased with a plain 16-bit bitmask.
    union {
        simd_t   m_simd;
        uint16_t m_bits;
    };
};



/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> bool extract (const vbool16& a);

/// Helper: substitute val for a[i]
template<int i> vbool16 insert (const vbool16& a, bool val);

/// Logical reduction across all components.
bool reduce_and (const vbool16& v);
bool reduce_or (const vbool16& v);

// Are all/any/no components true?
bool all (const vbool16& v);
bool any (const vbool16& v);
bool none (const vbool16& v);





/// Integer 4-vector, accelerated by SIMD instructions when available.
882 class vint4 { 883 public: 884 static const char* type_name() { return "vint4"; } 885 typedef int value_t; ///< Underlying equivalent scalar value type 886 enum { elements = 4 }; ///< Number of scalar elements 887 enum { paddedelements =4 }; ///< Number of scalar elements for full pad 888 enum { bits = 128 }; ///< Total number of bits 889 typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used 890 typedef vbool4 vbool_t; ///< bool type of the same length 891 typedef vfloat4 vfloat_t; ///< float type of the same length 892 typedef vint4 vint_t; ///< int type of the same length 893 typedef vbool4 bool_t; // old name (deprecated 1.8) 894 typedef vfloat4 float_t; // old name (deprecated 1.8) 895 896 /// Default constructor (contents undefined) 897 vint4 () { } 898 899 /// Construct from a single value (store it in all slots) 900 vint4 (int a); 901 902 /// Construct from 2 values -- (a,a,b,b) 903 vint4 (int a, int b); 904 905 /// Construct from 4 values 906 vint4 (int a, int b, int c, int d); 907 908 /// Construct from a pointer to values 909 vint4 (const int *vals); 910 911 /// Construct from a pointer to unsigned short values 912 explicit vint4 (const unsigned short *vals); 913 914 /// Construct from a pointer to signed short values 915 explicit vint4 (const short *vals); 916 917 /// Construct from a pointer to unsigned char values (0 - 255) 918 explicit vint4 (const unsigned char *vals); 919 920 /// Construct from a pointer to signed char values (-128 - 127) 921 explicit vint4 (const char *vals); 922 923 /// Copy construct from another vint4 924 vint4 (const vint4 & other) { m_simd = other.m_simd; } 925 926 /// Convert a vfloat to an vint. 
Equivalent to i = (int)f; 927 explicit vint4 (const vfloat4& f); // implementation below 928 929 /// Construct from the underlying SIMD type 930 vint4 (const simd_t& m) : m_simd(m) { } 931 932 /// Return the raw SIMD type 933 operator simd_t () const { return m_simd; } 934 simd_t simd () const { return m_simd; } 935 simd_t& simd () { return m_simd; } 936 937 /// Return a pointer to the underlying scalar type 938 const value_t* data () const { return (const value_t*)this; } 939 value_t* data () { return (value_t*)this; } 940 941 /// Sset all components to 0 942 void clear () ; 943 944 /// Return an vint4 with all components set to 0 945 static const vint4 Zero (); 946 947 /// Return an vint4 with all components set to 1 948 static const vint4 One (); 949 950 /// Return an vint4 with all components set to -1 (aka 0xffffffff) 951 static const vint4 NegOne (); 952 953 /// Return an vint4 with incremented components (e.g., 0,1,2,3). 954 /// Optional arguments can give a non-zero starting point and step size. 955 static const vint4 Iota (int start=0, int step=1); 956 957 /// Return an vint4 with "geometric" iota: (1, 2, 4, 8). 958 static const vint4 Giota (); 959 960 /// Assign one value to all components. 961 const vint4 & operator= (int a); 962 963 /// Assignment from another vint4 964 const vint4 & operator= (const vint4& other) ; 965 966 /// Component access (get) 967 int operator[] (int i) const; 968 969 /// Component access (set) 970 int& operator[] (int i); 971 972 /// Component access (set). 973 void setcomp (int i, int value); 974 975 value_t x () const; 976 value_t y () const; 977 value_t z () const; 978 value_t w () const; 979 void set_x (value_t val); 980 void set_y (value_t val); 981 void set_z (value_t val); 982 void set_w (value_t val); 983 984 /// Helper: load a single int into all components 985 void load (int a); 986 987 /// Helper: load separate values into each component. 
988 void load (int a, int b, int c, int d); 989 990 /// Load from an array of 4 values 991 void load (const int *values); 992 993 void load (const int *values, int n) ; 994 995 /// Load from an array of 4 unsigned short values, convert to vint4 996 void load (const unsigned short *values) ; 997 998 /// Load from an array of 4 unsigned short values, convert to vint4 999 void load (const short *values); 1000 1001 /// Load from an array of 4 unsigned char values, convert to vint4 1002 void load (const unsigned char *values); 1003 1004 /// Load from an array of 4 unsigned char values, convert to vint4 1005 void load (const char *values); 1006 1007 /// Store the values into memory 1008 void store (int *values) const; 1009 1010 /// Store the first n values into memory 1011 void store (int *values, int n) const; 1012 1013 /// Store the least significant 16 bits of each element into adjacent 1014 /// unsigned shorts. 1015 void store (unsigned short *values) const; 1016 1017 /// Store the least significant 8 bits of each element into adjacent 1018 /// unsigned chars. 1019 void store (unsigned char *values) const; 1020 1021 /// Masked load -- read from values[] where mask is 1, load zero where 1022 /// mask is 0. 1023 void load_mask (int mask, const value_t *values); 1024 void load_mask (const vbool_t& mask, const value_t *values); 1025 1026 /// Masked store -- write to values[] where mask is enabled, don't 1027 /// touch values[] where it's not. 1028 void store_mask (int mask, value_t *values) const; 1029 void store_mask (const vbool_t& mask, value_t *values) const; 1030 1031 /// Load values from addresses (char*)basepatr + vindex[i]*scale 1032 template<int scale=4> 1033 void gather (const value_t *baseptr, const vint_t& vindex); 1034 /// Gather elements defined by the mask, leave others unchanged. 
1035 template<int scale=4> 1036 void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex); 1037 template<int scale=4> 1038 void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex); 1039 1040 /// Store values at addresses (char*)basepatr + vindex[i]*scale 1041 template<int scale=4> 1042 void scatter (value_t *baseptr, const vint_t& vindex) const; 1043 /// Scatter elements defined by the mask 1044 template<int scale=4> 1045 void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const; 1046 template<int scale=4> 1047 void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const; 1048 1049 // Arithmetic operators (component-by-component) 1050 friend vint4 operator+ (const vint4& a, const vint4& b); 1051 friend vint4 operator- (const vint4& a); 1052 friend vint4 operator- (const vint4& a, const vint4& b); 1053 friend vint4 operator* (const vint4& a, const vint4& b); 1054 friend vint4 operator/ (const vint4& a, const vint4& b); 1055 friend vint4 operator% (const vint4& a, const vint4& b); 1056 friend const vint4 & operator+= (vint4& a, const vint4& b); 1057 friend const vint4 & operator-= (vint4& a, const vint4& b); 1058 friend const vint4 & operator*= (vint4& a, const vint4& b); 1059 friend const vint4 & operator/= (vint4& a, const vint4& b); 1060 friend const vint4 & operator%= (vint4& a, const vint4& b); 1061 // Bitwise operators (component-by-component) 1062 friend vint4 operator& (const vint4& a, const vint4& b); 1063 friend vint4 operator| (const vint4& a, const vint4& b); 1064 friend vint4 operator^ (const vint4& a, const vint4& b); 1065 friend const vint4& operator&= (vint4& a, const vint4& b); 1066 friend const vint4& operator|= (vint4& a, const vint4& b); 1067 friend const vint4& operator^= (vint4& a, const vint4& b); 1068 friend vint4 operator~ (const vint4& a); 1069 friend vint4 operator<< (const vint4& a, unsigned int bits); 1070 friend vint4 operator>> (const vint4& a, unsigned 
int bits); 1071 friend const vint4& operator<<= (vint4& a, unsigned int bits); 1072 friend const vint4& operator>>= (vint4& a, unsigned int bits); 1073 // Comparison operators (component-by-component) 1074 friend vbool4 operator== (const vint4& a, const vint4& b); 1075 friend vbool4 operator!= (const vint4& a, const vint4& b); 1076 friend vbool4 operator< (const vint4& a, const vint4& b); 1077 friend vbool4 operator> (const vint4& a, const vint4& b); 1078 friend vbool4 operator>= (const vint4& a, const vint4& b); 1079 friend vbool4 operator<= (const vint4& a, const vint4& b); 1080 1081 /// Stream output 1082 friend std::ostream& operator<< (std::ostream& cout, const vint4 & a); 1083 1084 private: 1085 // The actual data representation 1086 union { 1087 simd_t m_simd; 1088 value_t m_val[elements]; 1089 }; 1090 }; 1091 1092 1093 1094 // Shift right logical -- unsigned shift. This differs from operator>> 1095 // in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but 1096 // srl((1<<31),1) == 1<<30. 1097 vint4 srl (const vint4& val, const unsigned int bits); 1098 1099 /// Helper: shuffle/swizzle with constant (templated) indices. 1100 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c) 1101 template<int i0, int i1, int i2, int i3> vint4 shuffle (const vint4& a); 1102 1103 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a) 1104 template<int i> vint4 shuffle (const vint4& a); 1105 1106 /// Helper: as rapid as possible extraction of one component, when the 1107 /// index is fixed. 1108 template<int i> int extract (const vint4& v); 1109 1110 /// The sum of all components, returned in all components. 1111 vint4 vreduce_add (const vint4& v); 1112 1113 // Reduction across all components 1114 int reduce_add (const vint4& v); 1115 int reduce_and (const vint4& v); 1116 int reduce_or (const vint4& v); 1117 1118 /// Use a bool mask to select between components of a (if mask[i] is false) 1119 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i]. 
1120 vint4 blend (const vint4& a, const vint4& b, const vbool4& mask); 1121 1122 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if 1123 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to 1124 /// blend(0,a,mask). 1125 vint4 blend0 (const vint4& a, const vbool4& mask); 1126 1127 /// Use a bool mask to select between components of a (if mask[i] is false) 1128 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to 1129 /// blend(0,a,!mask), or blend(a,0,mask). 1130 vint4 blend0not (const vint4& a, const vbool4& mask); 1131 1132 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a 1133 /// synonym for blend with arguments rearranged, but this is more clear 1134 /// because the arguments are symmetric to scalar (cond ? a : b). 1135 vint4 select (const vbool4& mask, const vint4& a, const vint4& b); 1136 1137 // Per-element math 1138 vint4 abs (const vint4& a); 1139 vint4 min (const vint4& a, const vint4& b); 1140 vint4 max (const vint4& a, const vint4& b); 1141 1142 /// Circular bit rotate by s bits, for N values at once. 1143 vint4 rotl (const vint4& x, const int s); 1144 // DEPRECATED(2.1) 1145 vint4 rotl32 (const vint4& x, const unsigned int k); 1146 1147 /// andnot(a,b) returns ((~a) & b) 1148 vint4 andnot (const vint4& a, const vint4& b); 1149 1150 /// Bitcast back and forth to intN (not a convert -- move the bits!) 1151 vint4 bitcast_to_int (const vbool4& x); 1152 vint4 bitcast_to_int (const vfloat4& x); 1153 vfloat4 bitcast_to_float (const vint4& x); 1154 1155 void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d); 1156 void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d, 1157 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3); 1158 1159 vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d); 1160 1161 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0. 
vint4 safe_mod (const vint4& a, const vint4& b);
vint4 safe_mod (const vint4& a, int b);




/// Integer 8-vector, accelerated by SIMD instructions when available.
class vint8 {
public:
    static const char* type_name() { return "vint8"; }
    typedef int value_t;      ///< Underlying equivalent scalar value type
    enum { elements = 8 };    ///< Number of scalar elements
    enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
    enum { bits = elements*32 }; ///< Total number of bits
    typedef simd_raw_t<int,elements>::type simd_t;  ///< the native SIMD type used
    typedef vbool8 vbool_t;   ///< bool type of the same length
    typedef vfloat8 vfloat_t; ///< float type of the same length
    typedef vint8 vint_t;     ///< int type of the same length
    typedef vbool8 bool_t;    // old name (deprecated 1.8)
    typedef vfloat8 float_t;  // old name (deprecated 1.8)

    /// Default constructor (contents undefined)
    vint8 () { }

    /// Construct from a single value (store it in all slots)
    vint8 (int a);

    /// Construct from 2 values.
    /// NOTE(review): original comment said "(a,a,b,b)", inherited from the
    /// vint4 version; for 8 lanes the replication pattern is presumably
    /// (a,a,a,a,b,b,b,b) -- confirm against the implementation.
    vint8 (int a, int b);

    /// Construct from 8 values
    vint8 (int a, int b, int c, int d, int e, int f, int g, int h);

    /// Construct from a pointer to values
    vint8 (const int *vals);

    /// Construct from a pointer to unsigned short values
    explicit vint8 (const unsigned short *vals);

    /// Construct from a pointer to signed short values
    explicit vint8 (const short *vals);

    /// Construct from a pointer to unsigned char values (0 - 255)
    explicit vint8 (const unsigned char *vals);

    /// Construct from a pointer to signed char values (-128 - 127)
    explicit vint8 (const char *vals);

    /// Copy construct from another vint8
    vint8 (const vint8 & other) { m_simd = other.m_simd; }

    /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
    explicit vint8 (const vfloat8& f); // implementation below

    /// Construct from two vint4's
    vint8 (const vint4 &lo, const vint4 &hi);

    /// Construct from the underlying SIMD type
    vint8 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Return a pointer to the underlying scalar type
    const value_t* data () const { return (const value_t*)this; }
    value_t* data () { return (value_t*)this; }

    /// Set all components to 0
    void clear () ;

    /// Return an vint8 with all components set to 0
    static const vint8 Zero ();

    /// Return an vint8 with all components set to 1
    static const vint8 One ();

    /// Return an vint8 with all components set to -1 (aka 0xffffffff)
    static const vint8 NegOne ();

    /// Return an vint8 with incremented components (e.g., 0,1,2,...,7).
    /// Optional arguments can give a non-zero starting point and step size.
    static const vint8 Iota (int start=0, int step=1);

    /// Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
    static const vint8 Giota ();

    /// Assign one value to all components.
    const vint8 & operator= (int a);

    /// Assignment from another vint8
    const vint8 & operator= (const vint8& other) ;

    /// Component access (get)
    int operator[] (int i) const;

    /// Component access (set)
    int& operator[] (int i);

    /// Component access (set).
    void setcomp (int i, int value);

    // Named access to the first four components.
    value_t x () const;
    value_t y () const;
    value_t z () const;
    value_t w () const;
    void set_x (value_t val);
    void set_y (value_t val);
    void set_z (value_t val);
    void set_w (value_t val);

    /// Extract the lower precision vint4
    vint4 lo () const;

    /// Extract the higher precision vint4
    vint4 hi () const;

    /// Helper: load a single int into all components
    void load (int a);

    /// Load separate values into each component.
    void load (int a, int b, int c, int d, int e, int f, int g, int h);

    /// Load from an array of 8 values
    void load (const int *values);

    /// Load only the first n values, zero the rest.
    void load (const int *values, int n) ;

    /// Load from an array of 8 unsigned short values, convert to vint8
    void load (const unsigned short *values) ;

    /// Load from an array of 8 signed short values, convert to vint8
    void load (const short *values);

    /// Load from an array of 8 unsigned char values, convert to vint8
    void load (const unsigned char *values);

    /// Load from an array of 8 signed char values, convert to vint8
    void load (const char *values);

    /// Store the values into memory
    void store (int *values) const;

    /// Store the first n values into memory
    void store (int *values, int n) const;

    /// Store the least significant 16 bits of each element into adjacent
    /// unsigned shorts.
    void store (unsigned short *values) const;

    /// Store the least significant 8 bits of each element into adjacent
    /// unsigned chars.
    void store (unsigned char *values) const;

    /// Masked load -- read from values[] where mask is 1, load zero where
    /// mask is 0.
    void load_mask (int mask, const value_t *values);
    void load_mask (const vbool_t& mask, const value_t *values);

    /// Masked store -- write to values[] where mask is enabled, don't
    /// touch values[] where it's not.
    void store_mask (int mask, value_t *values) const;
    void store_mask (const vbool_t& mask, value_t *values) const;

    /// Load values from addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void gather (const value_t *baseptr, const vint_t& vindex);
    /// Gather elements defined by the mask, leave others unchanged.
    template<int scale=4>
    void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
    template<int scale=4>
    void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);

    /// Store values at addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void scatter (value_t *baseptr, const vint_t& vindex) const;
    /// Scatter elements defined by the mask
    template<int scale=4>
    void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
    template<int scale=4>
    void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;

    // Arithmetic operators (component-by-component)
    friend vint8 operator+ (const vint8& a, const vint8& b);
    friend vint8 operator- (const vint8& a);
    friend vint8 operator- (const vint8& a, const vint8& b);
    friend vint8 operator* (const vint8& a, const vint8& b);
    friend vint8 operator/ (const vint8& a, const vint8& b);
    friend vint8 operator% (const vint8& a, const vint8& b);
    friend const vint8 & operator+= (vint8& a, const vint8& b);
    friend const vint8 & operator-= (vint8& a, const vint8& b);
    friend const vint8 & operator*= (vint8& a, const vint8& b);
    friend const vint8 & operator/= (vint8& a, const vint8& b);
    friend const vint8 & operator%= (vint8& a, const vint8& b);
    // Bitwise operators (component-by-component)
    friend vint8 operator& (const vint8& a, const vint8& b);
    friend vint8 operator| (const vint8& a, const vint8& b);
    friend vint8 operator^ (const vint8& a, const vint8& b);
    friend const vint8& operator&= (vint8& a, const vint8& b);
    friend const vint8& operator|= (vint8& a, const vint8& b);
    friend const vint8& operator^= (vint8& a, const vint8& b);
    friend vint8 operator~ (const vint8& a);
    friend vint8 operator<< (const vint8& a, unsigned int bits);
    friend vint8 operator>> (const vint8& a, unsigned int bits);
    friend const vint8& operator<<= (vint8& a, unsigned int bits);
    friend const vint8& operator>>= (vint8& a, unsigned int bits);
    // Comparison operators (component-by-component)
    friend vbool8 operator== (const vint8& a, const vint8& b);
    friend vbool8 operator!= (const vint8& a, const vint8& b);
    friend vbool8 operator< (const vint8& a, const vint8& b);
    friend vbool8 operator> (const vint8& a, const vint8& b);
    friend vbool8 operator>= (const vint8& a, const vint8& b);
    friend vbool8 operator<= (const vint8& a, const vint8& b);

    /// Stream output
    friend std::ostream& operator<< (std::ostream& cout, const vint8& a);

private:
    // The actual data representation: native SIMD register overlaid with
    // scalar access and with a pair of vint4 halves.
    union {
        simd_t m_simd;
        value_t m_val[elements];
        vint4 m_4[2];
    };
};



// Shift right logical -- unsigned shift. This differs from operator>>
// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
// srl((1<<31),1) == 1<<30.
vint8 srl (const vint8& val, const unsigned int bits);

/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2,5,5,6,6>(a) returns
/// (a[1],a[1],a[2],a[2],a[5],a[5],a[6],a[6])
template<int i0, int i1, int i2, int i3,
         int i4, int i5, int i6, int i7> vint8 shuffle (const vint8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
template<int i> vint8 shuffle (const vint8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> int extract (const vint8& v);

/// Helper: substitute val for a[i]
template<int i> vint8 insert (const vint8& a, int val);

/// The sum of all components, returned in all components.
vint8 vreduce_add (const vint8& v);

// Reduction across all components
int reduce_add (const vint8& v);
int reduce_and (const vint8& v);
int reduce_or (const vint8& v);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vint8 blend0 (const vint8& a, const vbool8& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vint8 blend0not (const vint8& a, const vbool8& mask);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vint8 select (const vbool8& mask, const vint8& a, const vint8& b);

// Per-element math
vint8 abs (const vint8& a);
vint8 min (const vint8& a, const vint8& b);
vint8 max (const vint8& a, const vint8& b);

/// Circular bit rotate by s bits, for N values at once.
vint8 rotl (const vint8& x, const int s);
// DEPRECATED(2.1)
vint8 rotl32 (const vint8& x, const unsigned int k);

/// andnot(a,b) returns ((~a) & b)
vint8 andnot (const vint8& a, const vint8& b);

/// Bitcast back and forth to intN (not a convert -- move the bits!)
vint8 bitcast_to_int (const vbool8& x);
vint8 bitcast_to_int (const vfloat8& x);
vfloat8 bitcast_to_float (const vint8& x);

// safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
vint8 safe_mod (const vint8& a, const vint8& b);
vint8 safe_mod (const vint8& a, int b);





/// Integer 16-vector, accelerated by SIMD instructions when available.
1465 class vint16 { 1466 public: 1467 static const char* type_name() { return "vint16"; } 1468 typedef int value_t; ///< Underlying equivalent scalar value type 1469 enum { elements = 16 }; ///< Number of scalar elements 1470 enum { paddedelements =16 }; ///< Number of scalar elements for full pad 1471 enum { bits = 128 }; ///< Total number of bits 1472 typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used 1473 typedef vbool16 vbool_t; ///< bool type of the same length 1474 typedef vfloat16 vfloat_t; ///< float type of the same length 1475 typedef vint16 vint_t; ///< int type of the same length 1476 typedef vbool16 bool_t; // old name (deprecated 1.8) 1477 typedef vfloat16 float_t; // old name (deprecated 1.8) 1478 1479 /// Default constructor (contents undefined) 1480 vint16 () { } 1481 1482 /// Construct from a single value (store it in all slots) 1483 vint16 (int a); 1484 1485 /// Construct from 16 values (won't work for vint16) 1486 vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, 1487 int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15); 1488 1489 /// Construct from a pointer to values 1490 vint16 (const int *vals); 1491 1492 /// Construct from a pointer to unsigned short values 1493 explicit vint16 (const unsigned short *vals); 1494 1495 /// Construct from a pointer to signed short values 1496 explicit vint16 (const short *vals); 1497 1498 /// Construct from a pointer to unsigned char values (0 - 255) 1499 explicit vint16 (const unsigned char *vals); 1500 1501 /// Construct from a pointer to signed char values (-128 - 127) 1502 explicit vint16 (const char *vals); 1503 1504 /// Copy construct from another vint16 1505 vint16 (const vint16 & other) { m_simd = other.m_simd; } 1506 1507 /// Convert a vfloat16 to an vint16. 
Equivalent to i = (int)f; 1508 explicit vint16 (const vfloat16& f); // implementation below 1509 1510 /// Construct from two vint8's 1511 vint16 (const vint8 &lo, const vint8 &hi); 1512 1513 /// Construct from four vint4's 1514 vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d); 1515 1516 /// Construct from the underlying SIMD type 1517 vint16 (const simd_t& m) : m_simd(m) { } 1518 1519 /// Return the raw SIMD type 1520 operator simd_t () const { return m_simd; } 1521 simd_t simd () const { return m_simd; } 1522 simd_t& simd () { return m_simd; } 1523 1524 /// Return a pointer to the underlying scalar type 1525 const value_t* data () const { return (const value_t*)this; } 1526 value_t* data () { return (value_t*)this; } 1527 1528 /// Sset all components to 0 1529 void clear () ; 1530 1531 /// Return an vint16 with all components set to 0 1532 static const vint16 Zero (); 1533 1534 /// Return an vint16 with all components set to 1 1535 static const vint16 One (); 1536 1537 /// Return an vint16 with all components set to -1 (aka 0xffffffff) 1538 static const vint16 NegOne (); 1539 1540 /// Return an vint16 with incremented components (e.g., 0,1,2,3). 1541 /// Optional arguments can give a non-zero starting point and step size. 1542 static const vint16 Iota (int start=0, int step=1); 1543 1544 /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...). 1545 static const vint16 Giota (); 1546 1547 /// Assign one value to all components. 1548 const vint16 & operator= (int a); 1549 1550 /// Assignment from another vint16 1551 const vint16 & operator= (const vint16& other) ; 1552 1553 /// Component access (get) 1554 int operator[] (int i) const; 1555 1556 /// Component access (set) 1557 int& operator[] (int i); 1558 1559 /// Component access (set). 
1560 void setcomp (int i, int value); 1561 1562 value_t x () const; 1563 value_t y () const; 1564 value_t z () const; 1565 value_t w () const; 1566 void set_x (value_t val); 1567 void set_y (value_t val); 1568 void set_z (value_t val); 1569 void set_w (value_t val); 1570 1571 /// Extract the lower precision vint8 1572 vint8 lo () const; 1573 1574 /// Extract the higher precision vint8 1575 vint8 hi () const; 1576 1577 /// Helper: load a single int into all components 1578 void load (int a); 1579 1580 /// Load separate values into each component. 1581 void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, 1582 int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15); 1583 1584 /// Load from an array of 16 values 1585 void load (const int *values); 1586 1587 void load (const int *values, int n) ; 1588 1589 /// Load from an array of 16 unsigned short values, convert to vint16 1590 void load (const unsigned short *values) ; 1591 1592 /// Load from an array of 16 unsigned short values, convert to vint16 1593 void load (const short *values); 1594 1595 /// Load from an array of 16 unsigned char values, convert to vint16 1596 void load (const unsigned char *values); 1597 1598 /// Load from an array of 16 unsigned char values, convert to vint16 1599 void load (const char *values); 1600 1601 /// Store the values into memory 1602 void store (int *values) const; 1603 1604 /// Store the first n values into memory 1605 void store (int *values, int n) const; 1606 1607 /// Store the least significant 16 bits of each element into adjacent 1608 /// unsigned shorts. 1609 void store (unsigned short *values) const; 1610 1611 /// Store the least significant 8 bits of each element into adjacent 1612 /// unsigned chars. 1613 void store (unsigned char *values) const; 1614 1615 /// Masked load -- read from values[] where mask is 1, load zero where 1616 /// mask is 0. 
1617 void load_mask (const vbool_t &mask, const value_t *values); 1618 void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); } 1619 1620 /// Masked store -- write to values[] where mask is enabled, don't 1621 /// touch values[] where it's not. 1622 void store_mask (const vbool_t &mask, value_t *values) const; 1623 void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); } 1624 1625 /// Load values from addresses (char*)basepatr + vindex[i]*scale 1626 template<int scale=4> 1627 void gather (const value_t *baseptr, const vint_t& vindex); 1628 /// Gather elements defined by the mask, leave others unchanged. 1629 template<int scale=4> 1630 void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex); 1631 template<int scale=4> 1632 void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) { 1633 gather_mask<scale> (vbool_t(mask), baseptr, vindex); 1634 } 1635 1636 /// Store values at addresses (char*)basepatr + vindex[i]*scale 1637 template<int scale=4> 1638 void scatter (value_t *baseptr, const vint_t& vindex) const; 1639 /// Scatter elements defined by the mask 1640 template<int scale=4> 1641 void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const; 1642 template<int scale=4> 1643 void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const { 1644 scatter_mask<scale> (vbool_t(mask), baseptr, vindex); 1645 } 1646 1647 // Arithmetic operators (component-by-component) 1648 friend vint16 operator+ (const vint16& a, const vint16& b); 1649 friend vint16 operator- (const vint16& a); 1650 friend vint16 operator- (const vint16& a, const vint16& b); 1651 friend vint16 operator* (const vint16& a, const vint16& b); 1652 friend vint16 operator/ (const vint16& a, const vint16& b); 1653 friend vint16 operator% (const vint16& a, const vint16& b); 1654 friend const vint16 & operator+= (vint16& a, const vint16& b); 1655 friend const vint16 & 
operator-= (vint16& a, const vint16& b); 1656 friend const vint16 & operator*= (vint16& a, const vint16& b); 1657 friend const vint16 & operator/= (vint16& a, const vint16& b); 1658 friend const vint16 & operator%= (vint16& a, const vint16& b); 1659 // Bitwise operators (component-by-component) 1660 friend vint16 operator& (const vint16& a, const vint16& b); 1661 friend vint16 operator| (const vint16& a, const vint16& b); 1662 friend vint16 operator^ (const vint16& a, const vint16& b); 1663 friend const vint16& operator&= (vint16& a, const vint16& b); 1664 friend const vint16& operator|= (vint16& a, const vint16& b); 1665 friend const vint16& operator^= (vint16& a, const vint16& b); 1666 friend vint16 operator~ (const vint16& a); 1667 friend vint16 operator<< (const vint16& a, unsigned int bits); 1668 friend vint16 operator>> (const vint16& a, unsigned int bits); 1669 friend const vint16& operator<<= (vint16& a, unsigned int bits); 1670 friend const vint16& operator>>= (vint16& a, unsigned int bits); 1671 // Comparison operators (component-by-component) 1672 friend vbool16 operator== (const vint16& a, const vint16& b); 1673 friend vbool16 operator!= (const vint16& a, const vint16& b); 1674 friend vbool16 operator< (const vint16& a, const vint16& b); 1675 friend vbool16 operator> (const vint16& a, const vint16& b); 1676 friend vbool16 operator>= (const vint16& a, const vint16& b); 1677 friend vbool16 operator<= (const vint16& a, const vint16& b); 1678 1679 /// Stream output 1680 friend std::ostream& operator<< (std::ostream& cout, const vint16& a); 1681 1682 private: 1683 // The actual data representation 1684 union { 1685 simd_t m_simd; 1686 value_t m_val[elements]; 1687 vint8 m_8[2]; 1688 }; 1689 }; 1690 1691 1692 1693 /// Shift right logical -- unsigned shift. This differs from operator>> 1694 /// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but 1695 /// srl((1<<31),1) == 1<<30. 
vint16 srl (const vint16& val, const unsigned int bits);

/// Shuffle groups of 4
template<int i0, int i1, int i2, int i3>
vint16 shuffle4 (const vint16& a);

/// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
template<int i> vint16 shuffle4 (const vint16& a);

/// Shuffle within each group of 4
template<int i0, int i1, int i2, int i3>
vint16 shuffle (const vint16& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vint16 shuffle (const vint16& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> int extract (const vint16& v);

/// Helper: substitute val for a[i]
template<int i> vint16 insert (const vint16& a, int val);

/// The sum of all components, returned in all components.
vint16 vreduce_add (const vint16& v);

// Reduction across all components, returned as a scalar.
int reduce_add (const vint16& v);
int reduce_and (const vint16& v);
int reduce_or (const vint16& v);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vint16 blend0 (const vint16& a, const vbool16& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vint16 blend0not (const vint16& a, const vbool16& mask);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vint16 select (const vbool16& mask, const vint16& a, const vint16& b);

// Per-element math
vint16 abs (const vint16& a);
vint16 min (const vint16& a, const vint16& b);
vint16 max (const vint16& a, const vint16& b);

/// Circular bit rotate by s bits, for N values at once.
vint16 rotl (const vint16& x, const int s);
// DEPRECATED(2.1) -- use rotl() instead
vint16 rotl32 (const vint16& x, const unsigned int k);

/// andnot(a,b) returns ((~a) & b)
vint16 andnot (const vint16& a, const vint16& b);

/// Bitcast back and forth to intN (not a convert -- move the bits!)
vint16 bitcast_to_int (const vbool16& x);
vint16 bitcast_to_int (const vfloat16& x);
vfloat16 bitcast_to_float (const vint16& x);

// safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
vint16 safe_mod (const vint16& a, const vint16& b);
vint16 safe_mod (const vint16& a, int b);




/// Floating point 4-vector, accelerated by SIMD instructions when
/// available.
class vfloat4 {
public:
    static const char* type_name() { return "vfloat4"; }
    typedef float value_t;    ///< Underlying equivalent scalar value type
    enum { elements = 4 };    ///< Number of scalar elements
    enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
    enum { bits = elements*32 }; ///< Total number of bits
    typedef simd_raw_t<float,4>::type simd_t;  ///< the native SIMD type used
    typedef vfloat4 vfloat_t; ///< SIMD float type
    typedef vint4 vint_t;     ///< SIMD int type
    typedef vbool4 vbool_t;   ///< SIMD bool type
    typedef vint4 int_t;      // old name (deprecated 1.8)
    typedef vbool4 bool_t;    // old name (deprecated 1.8)

    /// Default constructor (contents undefined)
    vfloat4 () { }

    /// Construct from a single value (store it in all slots)
    vfloat4 (float a) { load(a); }

    /// Construct from 3 or 4 values. (If only 3 are supplied, the 4th
    /// will be 0.)
    vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }

    /// Construct from a pointer to 4 values
    vfloat4 (const float *f) { load (f); }

    /// Copy construct from another vfloat4
    vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }

    /// Construct from a vint4 (promoting all components to float)
    explicit vfloat4 (const vint4& ival);

    /// Construct from the underlying SIMD type
    vfloat4 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Return a pointer to the underlying scalar type
    const value_t* data () const { return (const value_t*)this; }
    value_t* data () { return (value_t*)this; }

    /// Construct from a Imath::V3f (the 4th component is set to 0)
    explicit vfloat4 (const Imath::V3f &v) { load (v[0], v[1], v[2]); }

    /// Cast to a Imath::V3f
    const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }

    /// Construct from a Imath::V4f
    explicit vfloat4 (const Imath::V4f &v) { load ((const float *)&v); }

    /// Cast to a Imath::V4f
    const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }

    /// Construct from a pointer to 4 unsigned short values
    explicit vfloat4 (const unsigned short *vals) { load(vals); }

    /// Construct from a pointer to 4 short values
    explicit vfloat4 (const short *vals) { load(vals); }

    /// Construct from a pointer to 4 unsigned char values
    explicit vfloat4 (const unsigned char *vals) { load(vals); }

    /// Construct from a pointer to 4 char values
    explicit vfloat4 (const char *vals) { load(vals); }

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Construct from a pointer to 4 half (16 bit float) values
    explicit vfloat4 (const half *vals) { load(vals); }
#endif

    /// Assign a single value to all components
    const vfloat4 & operator= (float a) { load(a); return *this; }

    /// Assign a vfloat4
    const vfloat4 & operator= (vfloat4 other) {
        m_simd = other.m_simd;
        return *this;
    }

    /// Return a vfloat4 with all components set to 0.0
    static const vfloat4 Zero ();

    /// Return a vfloat4 with all components set to 1.0
    static const vfloat4 One ();

    /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
    /// Optional argument can give a non-zero starting point and non-1 step.
    static const vfloat4 Iota (float start=0.0f, float step=1.0f);

    /// Set all components to 0.0
    void clear ();

    /// Assign from a Imath::V4f
    const vfloat4 & operator= (const Imath::V4f &v);

    /// Assign from a Imath::V3f
    const vfloat4 & operator= (const Imath::V3f &v);

    /// Component access (get)
    float operator[] (int i) const;
    /// Component access (set)
    float& operator[] (int i);

    /// Component access (set).
    void setcomp (int i, float value);

    // Named access to the four components, get and set.
    value_t x () const;
    value_t y () const;
    value_t z () const;
    value_t w () const;
    void set_x (value_t val);
    void set_y (value_t val);
    void set_z (value_t val);
    void set_w (value_t val);

    /// Helper: load a single value into all components
    void load (float val);

    /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
    void load (float a, float b, float c, float d=0.0f);

    /// Load from an array of 4 values
    void load (const float *values);

    /// Load from a partial array of <=4 values. Unassigned values are
    /// undefined.
    void load (const float *values, int n);

    /// Load from an array of 4 unsigned short values, convert to float
    void load (const unsigned short *values);

    /// Load from an array of 4 short values, convert to float
    void load (const short *values);

    /// Load from an array of 4 unsigned char values, convert to float
    void load (const unsigned char *values);

    /// Load from an array of 4 char values, convert to float
    void load (const char *values);

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Load from an array of 4 half values, convert to float
    void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

    /// Store the values into memory
    void store (float *values) const;

    /// Store the first n values into memory
    void store (float *values, int n) const;

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Store the values into memory, converting to half
    void store (half *values) const;
#endif

    /// Masked load -- read from values[] where mask is 1, load zero where
    /// mask is 0.
    void load_mask (int mask, const value_t *values);
    void load_mask (const vbool_t& mask, const value_t *values);

    /// Masked store -- write to values[] where mask is enabled, don't
    /// touch values[] where it's not.
    void store_mask (int mask, value_t *values) const;
    void store_mask (const vbool_t& mask, value_t *values) const;

    /// Load values from addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void gather (const value_t *baseptr, const vint_t& vindex);
    /// Gather elements defined by the mask, leave others unchanged.
    template<int scale=4>
    void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
    template<int scale=4>
    void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);

    /// Store values at addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void scatter (value_t *baseptr, const vint_t& vindex) const;
    /// Scatter elements defined by the mask
    template<int scale=4>
    void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
    template<int scale=4>
    void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;

    // Arithmetic operators (component-by-component)
    friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
    const vfloat4 & operator+= (const vfloat4& a);
    vfloat4 operator- () const;
    friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
    const vfloat4 & operator-= (const vfloat4& a);
    friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
    friend vfloat4 operator* (const vfloat4& a, float b);
    friend vfloat4 operator* (float a, const vfloat4& b);
    const vfloat4 & operator*= (const vfloat4& a);
    const vfloat4 & operator*= (float val);
    friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
    const vfloat4 & operator/= (const vfloat4& a);
    const vfloat4 & operator/= (float val);

    // Comparison operations (component-by-component)
    friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
    friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
    friend vbool4 operator< (const vfloat4& a, const vfloat4& b);
    friend vbool4 operator> (const vfloat4& a, const vfloat4& b);
    friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
    friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);

    // Some oddball items that are handy

    /// Combine the first two components of A with the first two components
    /// of B.
    friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);

    /// Combine the first two components of A with the first two components
    /// of B, but interleaved.
    friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);

    /// Return xyz components, plus 0 for w
    vfloat4 xyz0 () const;

    /// Return xyz components, plus 1 for w
    vfloat4 xyz1 () const;

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);

protected:
    // The actual data representation: the union lets us view the same
    // storage as the native SIMD register type or as an array of scalars.
    union {
        simd_t  m_simd;
        value_t m_val[paddedelements];
    };
};


/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
template<int i0, int i1, int i2, int i3> vfloat4 shuffle (const vfloat4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vfloat4 shuffle (const vfloat4& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> float extract (const vfloat4& a);

/// Helper: substitute val for a[i]
template<int i> vfloat4 insert (const vfloat4& a, float val);

/// The sum of all components, returned in all components.
vfloat4 vreduce_add (const vfloat4& v);

/// The sum of all components, returned as a scalar.
float reduce_add (const vfloat4& v);

/// Return the float dot (inner) product of a and b in every component.
vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);

/// Return the float dot (inner) product of a and b.
float dot (const vfloat4 &a, const vfloat4 &b);

/// Return the float 3-component dot (inner) product of a and b in
/// all components.
vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);

/// Return the float 3-component dot (inner) product of a and b.
float dot3 (const vfloat4 &a, const vfloat4 &b);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vfloat4 blend0 (const vfloat4& a, const vbool4& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vfloat4 blend0not (const vfloat4& a, const vbool4& mask);

/// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
/// that is 0, return 0 rather than Inf.
vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);

/// Homogeneous divide to turn a vfloat4 into a vfloat3.
vfloat3 hdiv (const vfloat4 &a);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);

// Per-element math
vfloat4 abs (const vfloat4& a);    ///< absolute value (float)
vfloat4 sign (const vfloat4& a);   ///< 1.0 when value >= 0, -1 when negative
vfloat4 ceil (const vfloat4& a);
vfloat4 floor (const vfloat4& a);
vint4 ifloor (const vfloat4& a);   ///< (int)floor
inline vint4 floori (const vfloat4& a) { return ifloor(a); } // DEPRECATED(1.8) alias

/// Per-element round to nearest integer.
/// CAVEAT: the rounding when mid-way between integers may differ depending
/// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
/// integer) but std::round() says to round away from 0 regardless of
/// current rounding mode (but that is multiple instructions on x64).
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::round().
vfloat4 round (const vfloat4& a);

/// Per-element round to nearest integer (equivalent to vint(round(a))).
/// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
/// C++ std::rint() which says to use the current rounding mode.
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::rint().
vint4 rint (const vfloat4& a);

vfloat4 rcp_fast (const vfloat4 &a);    ///< Fast, approximate 1/a
vfloat4 sqrt (const vfloat4 &a);
vfloat4 rsqrt (const vfloat4 &a);       ///< Fully accurate 1/sqrt
vfloat4 rsqrt_fast (const vfloat4 &a);  ///< Fast, approximate 1/sqrt
vfloat4 min (const vfloat4& a, const vfloat4& b);  ///< Per-element min
vfloat4 max (const vfloat4& a, const vfloat4& b);  ///< Per-element max
template <typename T> T exp (const T& v);  // template for all SIMD variants
template <typename T> T log (const T& v);

/// andnot(a,b) returns ((~a) & b)
vfloat4 andnot (const vfloat4& a, const vfloat4& b);

// Fused multiply and add (or subtract):
vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c

/// Transpose the rows and columns of the 4x4 matrix [a b c d].
/// In the end, a will have the original (a[0], b[0], c[0], d[0]),
/// b will have the original (a[1], b[1], c[1], d[1]), and so on.
void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
                vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);

/// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
                  const vfloat4& c, const vfloat4& d);



/// Floating point 3-vector, aligned to be internally identical to a vfloat4.
/// The way it differs from vfloat4 is that all of the load functions only
/// load three values, and all the stores only store 3 values. The vast
/// majority of ops just fall back to the vfloat4 version, and so will
/// operate on the 4th component, but we won't care about those results.
class vfloat3 : public vfloat4 {
public:
    static const char* type_name() { return "vfloat3"; }
    enum { elements = 3 };    ///< Number of scalar elements
    enum { paddedelements = 4 }; ///< Number of scalar elements for full pad

    /// Default constructor (contents undefined)
    vfloat3 () { }

    /// Construct from a single value (store it in all slots)
    vfloat3 (float a) { load(a); }

    /// Construct from 3 values
    vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }

    /// Construct from a pointer to 3 values
    vfloat3 (const float *f) { load (f); }

    /// Copy construct from another vfloat3
    vfloat3 (const vfloat3 &other);

    /// Construct from a vfloat4. Note: it will not zero out the internal
    /// 4th component, but rather accept on faith that the vfloat4 you are
    /// giving it is a valid vfloat3. Be careful!
    explicit vfloat3 (const vfloat4 &other);

#if OIIO_SIMD
    /// Construct from the underlying SIMD type. Note: it will not zero out
    /// the internal 4th component, but rather accept on faith that the
    /// vfloat4 you are giving it is a valid vfloat3. Be careful!
    explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
#endif

    /// Construct from a Imath::V3f
    vfloat3 (const Imath::V3f &v) : vfloat4(v) { }

    /// Cast to a Imath::V3f
    const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }

    /// Construct from a pointer to 3 unsigned short values
    explicit vfloat3 (const unsigned short *vals) { load(vals); }

    /// Construct from a pointer to 3 short values
    explicit vfloat3 (const short *vals) { load(vals); }

    /// Construct from a pointer to 3 unsigned char values
    explicit vfloat3 (const unsigned char *vals) { load(vals); }

    /// Construct from a pointer to 3 char values
    explicit vfloat3 (const char *vals) { load(vals); }

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Construct from a pointer to 3 half (16 bit float) values
    explicit vfloat3 (const half *vals) { load(vals); }
#endif

    /// Assign a single value to all components
    const vfloat3 & operator= (float a) { load(a); return *this; }

    /// Return a vfloat3 with all components set to 0.0
    static const vfloat3 Zero ();

    /// Return a vfloat3 with all components set to 1.0
    static const vfloat3 One ();

    /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
    /// Optional argument can give a non-zero starting point and non-1 step.
    static const vfloat3 Iota (float start=0.0f, float step=1.0f);

    /// Helper: load a single value into all components
    void load (float val);

    /// Load from an array of 3 values
    void load (const float *values);

    /// Load from a partial array of n (<= 3) values
    void load (const float *values, int n);

    /// Load from an array of 3 unsigned short values, convert to float
    void load (const unsigned short *values);

    /// Load from an array of 3 short values, convert to float
    void load (const short *values);

    /// Load from an array of 3 unsigned char values, convert to float
    void load (const unsigned char *values);

    /// Load from an array of 3 char values, convert to float
    void load (const char *values);

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Load from an array of 3 half values, convert to float
    void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

    /// Store the 3 values into memory
    void store (float *values) const;

    /// Store the first n (<= 3) values into memory
    void store (float *values, int n) const;

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Store the 3 values into memory, converting to half
    void store (half *values) const;
#endif

    /// Store into an Imath::V3f reference.
    void store (Imath::V3f &vec) const;

    // Math operators -- define in terms of vfloat3.
    friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
    const vfloat3 & operator+= (const vfloat3& a);
    vfloat3 operator- () const;
    friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
    const vfloat3 & operator-= (const vfloat3& a);
    friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
    friend vfloat3 operator* (const vfloat3& a, float b);
    friend vfloat3 operator* (float a, const vfloat3& b);
    const vfloat3 & operator*= (const vfloat3& a);
    const vfloat3 & operator*= (float a);
    friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
    const vfloat3 & operator/= (const vfloat3& a);
    const vfloat3 & operator/= (float a);

    /// Square of the length of the vector
    float length2() const;
    /// Length of the vector
    float length() const;

    /// Return a normalized version of the vector.
    vfloat3 normalized () const;
    /// Return a fast, approximate normalized version of the vector.
    vfloat3 normalized_fast () const;
    /// Normalize in place.
    void normalize() { *this = normalized(); }

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
};



// Per-element math on float3
vfloat3 abs (const vfloat3& a);
vfloat3 sign (const vfloat3& a);
vfloat3 ceil (const vfloat3& a);
vfloat3 floor (const vfloat3& a);
vfloat3 round (const vfloat3& a);



/// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
/// not in registers) isomorphic to Imath::M44f.
class matrix44 {
public:
    // Uninitialized by default, for speed. (On the non-SSE path, Imath
    // requires an explicit UNINITIALIZED tag to skip initialization.)
    OIIO_FORCEINLINE matrix44 ()
#ifndef OIIO_SIMD_SSE
        : m_mat(Imath::UNINITIALIZED)
#endif
    { }

    /// Construct from a reference to an Imath::M44f
    OIIO_FORCEINLINE explicit matrix44 (const Imath::M44f &M) {
#if OIIO_SIMD_SSE
        m_row[0].load (M[0]);
        m_row[1].load (M[1]);
        m_row[2].load (M[2]);
        m_row[3].load (M[3]);
#else
        m_mat = M;
#endif
    }

    /// Construct from a float array (16 contiguous floats, row-major)
    OIIO_FORCEINLINE explicit matrix44 (const float *f) {
#if OIIO_SIMD_SSE
        m_row[0].load (f+0);
        m_row[1].load (f+4);
        m_row[2].load (f+8);
        m_row[3].load (f+12);
#else
        m_mat = *(const Imath::M44f*)f;
#endif
    }

    /// Construct from 4 vfloat4 rows
    OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
                                        const vfloat4& c, const vfloat4& d) {
#if OIIO_SIMD_SSE
        m_row[0] = a; m_row[1] = b; m_row[2] = c; m_row[3] = d;
#else
        a.store (m_mat[0]);
        b.store (m_mat[1]);
        c.store (m_mat[2]);
        d.store (m_mat[3]);
#endif
    }
    /// Construct from 4 float[4] rows
    OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
                                        const float *c, const float *d) {
#if OIIO_SIMD_SSE
        m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
#else
        memcpy (m_mat[0], a, 4*sizeof(float));
        memcpy (m_mat[1], b, 4*sizeof(float));
        memcpy (m_mat[2], c, 4*sizeof(float));
        memcpy (m_mat[3], d, 4*sizeof(float));
#endif
    }

    /// Construct from 16 floats, listed row by row
    OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
                               float f10, float f11, float f12, float f13,
                               float f20, float f21, float f22, float f23,
                               float f30, float f31, float f32, float f33)
    {
#if OIIO_SIMD_SSE
        m_row[0].load (f00, f01, f02, f03);
        m_row[1].load (f10, f11, f12, f13);
        m_row[2].load (f20, f21, f22, f23);
        m_row[3].load (f30, f31, f32, f33);
#else
        m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
        m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
        m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
        m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
#endif
    }

    /// Present as an Imath::M44f
    const Imath::M44f& M44f() const;

    /// Return one row
    vfloat4 operator[] (int i) const;

    /// Return the transposed matrix
    matrix44 transposed () const;

    /// Transform 3-point V by 4x4 matrix M.
    vfloat3 transformp (const vfloat3 &V) const;

    /// Transform 3-vector V by 4x4 matrix M.
    vfloat3 transformv (const vfloat3 &V) const;

    /// Transform 3-vector V by the transpose of 4x4 matrix M.
    vfloat3 transformvT (const vfloat3 &V) const;

    friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
    friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);

    bool operator== (const matrix44& m) const;

    bool operator== (const Imath::M44f& m) const ;
    friend bool operator== (const Imath::M44f& a, const matrix44 &b);

    bool operator!= (const matrix44& m) const;

    bool operator!= (const Imath::M44f& m) const;
    friend bool operator!= (const Imath::M44f& a, const matrix44 &b);

    /// Return the inverse of the matrix.
    matrix44 inverse() const;

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);

private:
    // Storage: vfloat4 rows when SSE is available, otherwise fall back to
    // a plain Imath matrix. Both layouts are isomorphic to Imath::M44f.
#if OIIO_SIMD_SSE
    vfloat4 m_row[4];
#else
    Imath::M44f m_mat;
#endif
};

/// Transform 3-point V by 4x4 matrix M.
2402 vfloat3 transformp (const matrix44 &M, const vfloat3 &V); 2403 vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V); 2404 2405 /// Transform 3-vector V by 4x4 matrix M. 2406 vfloat3 transformv (const matrix44 &M, const vfloat3 &V); 2407 vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V); 2408 2409 // Transform 3-vector by the transpose of 4x4 matrix M. 2410 vfloat3 transformvT (const matrix44 &M, const vfloat3 &V); 2411 vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V); 2412 2413 2414 2415 2416 /// Floating point 8-vector, accelerated by SIMD instructions when 2417 /// available. 2418 class vfloat8 { 2419 public: 2420 static const char* type_name() { return "vfloat8"; } 2421 typedef float value_t; ///< Underlying equivalent scalar value type 2422 enum { elements = 8 }; ///< Number of scalar elements 2423 enum { paddedelements = 8 }; ///< Number of scalar elements for full pad 2424 enum { bits = elements*32 }; ///< Total number of bits 2425 typedef simd_raw_t<float,8>::type simd_t; ///< the native SIMD type used 2426 typedef vfloat8 vfloat_t; ///< SIMD int type 2427 typedef vint8 vint_t; ///< SIMD int type 2428 typedef vbool8 vbool_t; ///< SIMD bool type 2429 typedef vint8 int_t; // old name (deprecated 1.8) 2430 typedef vbool8 bool_t; // old name (deprecated 1.8) 2431 2432 /// Default constructor (contents undefined) 2433 vfloat8 () { } 2434 2435 /// Construct from a single value (store it in all slots) 2436 vfloat8 (float a) { load(a); } 2437 2438 /// Construct from 8 values 2439 vfloat8 (float a, float b, float c, float d, 2440 float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); } 2441 2442 /// Construct from a pointer to 8 values 2443 vfloat8 (const float *f) { load (f); } 2444 2445 /// Copy construct from another vfloat8 2446 vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; } 2447 2448 /// Construct from an int vector (promoting all components to float) 2449 explicit vfloat8 (const vint8& ival); 2450 2451 /// 
Construct from two vfloat4's 2452 vfloat8 (const vfloat4 &lo, const vfloat4 &hi); 2453 2454 /// Construct from the underlying SIMD type 2455 vfloat8 (const simd_t& m) : m_simd(m) { } 2456 2457 /// Return the raw SIMD type 2458 operator simd_t () const { return m_simd; } 2459 simd_t simd () const { return m_simd; } 2460 simd_t& simd () { return m_simd; } 2461 2462 /// Return a pointer to the underlying scalar type 2463 const value_t* data () const { return (const value_t*)this; } 2464 value_t* data () { return (value_t*)this; } 2465 2466 /// Construct from a pointer to unsigned short values 2467 explicit vfloat8 (const unsigned short *vals) { load(vals); } 2468 2469 /// Construct from a pointer to short values 2470 explicit vfloat8 (const short *vals) { load(vals); } 2471 2472 /// Construct from a pointer to unsigned char values 2473 explicit vfloat8 (const unsigned char *vals) { load(vals); } 2474 2475 /// Construct from a pointer to char values 2476 explicit vfloat8 (const char *vals) { load(vals); } 2477 2478 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 2479 /// Construct from a pointer to half (16 bit float) values 2480 explicit vfloat8 (const half *vals) { load(vals); } 2481 #endif 2482 2483 /// Assign a single value to all components 2484 const vfloat8& operator= (float a) { load(a); return *this; } 2485 2486 /// Assign a vfloat8 2487 const vfloat8& operator= (vfloat8 other) { 2488 m_simd = other.m_simd; 2489 return *this; 2490 } 2491 2492 /// Return a vfloat8 with all components set to 0.0 2493 static const vfloat8 Zero (); 2494 2495 /// Return a vfloat8 with all components set to 1.0 2496 static const vfloat8 One (); 2497 2498 /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...) 2499 /// Optional argument can give a non-zero starting point and non-1 step. 
2500 static const vfloat8 Iota (float start=0.0f, float step=1.0f); 2501 2502 /// Set all components to 0.0 2503 void clear (); 2504 2505 /// Component access (get) 2506 float operator[] (int i) const; 2507 /// Component access (set) 2508 float& operator[] (int i); 2509 2510 /// Component access (set). 2511 void setcomp (int i, float value); 2512 2513 value_t x () const; 2514 value_t y () const; 2515 value_t z () const; 2516 value_t w () const; 2517 void set_x (value_t val); 2518 void set_y (value_t val); 2519 void set_z (value_t val); 2520 void set_w (value_t val); 2521 2522 /// Extract the lower precision vfloat4 2523 vfloat4 lo () const; 2524 2525 /// Extract the higher precision vfloat4 2526 vfloat4 hi () const; 2527 2528 /// Helper: load a single value into all components 2529 void load (float val); 2530 2531 /// Helper: load 8 values 2532 void load (float a, float b, float c, float d, 2533 float e, float f, float g, float h); 2534 2535 /// Load from an array of values 2536 void load (const float *values); 2537 2538 /// Load from a partial array of <=8 values. Unassigned values are 2539 /// undefined. 
    void load (const float *values, int n);

    /// Load from an array of 8 unsigned short values, convert to float
    void load (const unsigned short *values);

    /// Load from an array of 8 short values, convert to float
    void load (const short *values);

    /// Load from an array of 8 unsigned char values, convert to float
    void load (const unsigned char *values);

    /// Load from an array of 8 char values, convert to float
    void load (const char *values);

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Load from an array of 8 half values, convert to float
    void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

    /// Store the 8 values into memory.
    void store (float *values) const;

    /// Store the first n values into memory
    void store (float *values, int n) const;

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Store the 8 values into memory, converting to half.
    void store (half *values) const;
#endif

    /// Masked load -- read from values[] where mask is 1, load zero where
    /// mask is 0.
    void load_mask (int mask, const value_t *values);
    void load_mask (const vbool_t& mask, const value_t *values);

    /// Masked store -- write to values[] where mask is enabled, don't
    /// touch values[] where it's not.
    void store_mask (int mask, value_t *values) const;
    void store_mask (const vbool_t& mask, value_t *values) const;

    /// Load values from addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void gather (const value_t *baseptr, const vint_t& vindex);
    /// Gather elements defined by the mask, leave others unchanged.
    template<int scale=4>
    void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
    template<int scale=4>
    void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);

    /// Store values at addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void scatter (value_t *baseptr, const vint_t& vindex) const;
    /// Scatter elements defined by the mask
    template<int scale=4>
    void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
    template<int scale=4>
    void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;

    // Arithmetic operators (component-by-component)
    friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator- (const vfloat8& a);
    friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator* (const vfloat8& a, float b);
    friend vfloat8 operator* (float a, const vfloat8& b);
    friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);

    // Comparison operations
    friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator<  (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator>  (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);

    // Some oddball items that are handy

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);

protected:
    // The actual data representation
    union {
        simd_t  m_simd;
        value_t m_val[paddedelements];
        vfloat4 m_4[2];
    };
};


/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
vfloat8 shuffle (const vfloat8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
template<int i> vfloat8 shuffle (const vfloat8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> float extract (const vfloat8& a);

/// Helper: substitute val for a[i]
template<int i> vfloat8 insert (const vfloat8& a, float val);

/// The sum of all components, returned in all components.
vfloat8 vreduce_add (const vfloat8& v);

/// The sum of all components, returned as a scalar.
float reduce_add (const vfloat8& v);

/// Return the float dot (inner) product of a and b in every component.
vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);

/// Return the float dot (inner) product of a and b.
float dot (const vfloat8 &a, const vfloat8 &b);

/// Return the float 3-component dot (inner) product of a and b in
/// all components.
vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);

/// Return the float 3-component dot (inner) product of a and b.
float dot3 (const vfloat8 &a, const vfloat8 &b);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vfloat8 blend0 (const vfloat8& a, const vbool8& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vfloat8 blend0not (const vfloat8& a, const vbool8& mask);

/// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
/// that is 0, return 0 rather than Inf.
vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);

// Per-element math
vfloat8 abs (const vfloat8& a);    ///< absolute value (float)
vfloat8 sign (const vfloat8& a);   ///< 1.0 when value >= 0, -1 when negative
vfloat8 ceil (const vfloat8& a);
vfloat8 floor (const vfloat8& a);
vint8 ifloor (const vfloat8& a);   ///< (int)floor
inline vint8 floori (const vfloat8& a) { return ifloor(a); }  // DEPRECATED(1.8) alias

/// Per-element round to nearest integer.
/// CAVEAT: the rounding when mid-way between integers may differ depending
/// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
/// integer) but std::round() says to round away from 0 regardless of
/// current rounding mode (but that is multiple instructions on x64).
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::round().
2705 vfloat8 round (const vfloat8& a); 2706 2707 /// Per-element round to nearest integer (equivalent to vint(round(a))). 2708 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from 2709 /// C++ std::rint() which says to use the current rounding mode. 2710 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly 2711 /// match std::rint(). 2712 vint8 rint (const vfloat8& a); 2713 2714 vfloat8 rcp_fast (const vfloat8 &a); ///< Fast, approximate 1/a 2715 vfloat8 sqrt (const vfloat8 &a); 2716 vfloat8 rsqrt (const vfloat8 &a); ///< Fully accurate 1/sqrt 2717 vfloat8 rsqrt_fast (const vfloat8 &a); ///< Fast, approximate 1/sqrt 2718 vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min 2719 vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max 2720 // vfloat8 exp (const vfloat8& v); // See template with vfloat4 2721 // vfloat8 log (const vfloat8& v); // See template with vfloat4 2722 2723 /// andnot(a,b) returns ((~a) & b) 2724 vfloat8 andnot (const vfloat8& a, const vfloat8& b); 2725 2726 // Fused multiply and add (or subtract): 2727 vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b + c 2728 vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b - c 2729 vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c 2730 vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c 2731 2732 2733 2734 /// Floating point 16-vector, accelerated by SIMD instructions when 2735 /// available. 
2736 class vfloat16 { 2737 public: 2738 static const char* type_name() { return "vfloat16"; } 2739 typedef float value_t; ///< Underlying equivalent scalar value type 2740 enum { elements = 16 }; ///< Number of scalar elements 2741 enum { paddedelements = 16 }; ///< Number of scalar elements for full pad 2742 enum { bits = elements*32 }; ///< Total number of bits 2743 typedef simd_raw_t<float,16>::type simd_t; ///< the native SIMD type used 2744 typedef vfloat16 vfloat_t; ///< SIMD int type 2745 typedef vint16 vint_t; ///< SIMD int type 2746 typedef vbool16 vbool_t; ///< SIMD bool type 2747 typedef vint16 int_t; // old name (deprecated 1.8) 2748 typedef vbool16 bool_t; // old name (deprecated 1.8) 2749 2750 /// Default constructor (contents undefined) 2751 vfloat16 () { } 2752 2753 /// Construct from a single value (store it in all slots) 2754 vfloat16 (float a) { load(a); } 2755 2756 /// Construct from 16 values 2757 vfloat16 (float v0, float v1, float v2, float v3, 2758 float v4, float v5, float v6, float v7, 2759 float v8, float v9, float v10, float v11, 2760 float v12, float v13, float v14, float v15); 2761 2762 /// Construct from a pointer to 16 values 2763 vfloat16 (const float *f) { load (f); } 2764 2765 /// Copy construct from another vfloat16 2766 vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; } 2767 2768 /// Construct from an int vector (promoting all components to float) 2769 explicit vfloat16 (const vint16& ival); 2770 2771 /// Construct from two vfloat8's 2772 vfloat16 (const vfloat8 &lo, const vfloat8 &hi); 2773 2774 /// Construct from four vfloat4's 2775 vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d); 2776 2777 /// Construct from the underlying SIMD type 2778 vfloat16 (const simd_t& m) : m_simd(m) { } 2779 2780 /// Return the raw SIMD type 2781 operator simd_t () const { return m_simd; } 2782 simd_t simd () const { return m_simd; } 2783 simd_t& simd () { return m_simd; } 2784 2785 /// Return a pointer to 
the underlying scalar type 2786 const value_t* data () const { return (const value_t*)this; } 2787 value_t* data () { return (value_t*)this; } 2788 2789 /// Construct from a pointer to unsigned short values 2790 explicit vfloat16 (const unsigned short *vals) { load(vals); } 2791 2792 /// Construct from a pointer to short values 2793 explicit vfloat16 (const short *vals) { load(vals); } 2794 2795 /// Construct from a pointer to unsigned char values 2796 explicit vfloat16 (const unsigned char *vals) { load(vals); } 2797 2798 /// Construct from a pointer to char values 2799 explicit vfloat16 (const char *vals) { load(vals); } 2800 2801 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 2802 /// Construct from a pointer to half (16 bit float) values 2803 explicit vfloat16 (const half *vals) { load(vals); } 2804 #endif 2805 2806 /// Assign a single value to all components 2807 const vfloat16& operator= (float a) { load(a); return *this; } 2808 2809 /// Assign a vfloat16 2810 const vfloat16& operator= (vfloat16 other) { 2811 m_simd = other.m_simd; 2812 return *this; 2813 } 2814 2815 /// Return a vfloat16 with all components set to 0.0 2816 static const vfloat16 Zero (); 2817 2818 /// Return a vfloat16 with all components set to 1.0 2819 static const vfloat16 One (); 2820 2821 /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...) 2822 /// Optional argument can give a non-zero starting point and non-1 step. 2823 static const vfloat16 Iota (float start=0.0f, float step=1.0f); 2824 2825 /// Set all components to 0.0 2826 void clear (); 2827 2828 /// Component access (get) 2829 float operator[] (int i) const; 2830 /// Component access (set) 2831 float& operator[] (int i); 2832 2833 /// Component access (set). 
2834 void setcomp (int i, float value); 2835 2836 value_t x () const; 2837 value_t y () const; 2838 value_t z () const; 2839 value_t w () const; 2840 void set_x (value_t val); 2841 void set_y (value_t val); 2842 void set_z (value_t val); 2843 void set_w (value_t val); 2844 2845 /// Extract the lower precision vfloat8 2846 vfloat8 lo () const; 2847 2848 /// Extract the higher precision vfloat8 2849 vfloat8 hi () const; 2850 2851 /// Helper: load a single value into all components 2852 void load (float val); 2853 2854 /// Load separate values into each component. 2855 void load (float v0, float v1, float v2, float v3, 2856 float v4, float v5, float v6, float v7, 2857 float v8, float v9, float v10, float v11, 2858 float v12, float v13, float v14, float v15); 2859 2860 /// Load from an array of values 2861 void load (const float *values); 2862 2863 /// Load from a partial array of <=16 values. Unassigned values are 2864 /// undefined. 2865 void load (const float *values, int n); 2866 2867 /// Load from an array of 16 unsigned short values, convert to float 2868 void load (const unsigned short *values); 2869 2870 /// Load from an array of 16 short values, convert to float 2871 void load (const short *values); 2872 2873 /// Load from an array of 16 unsigned char values, convert to float 2874 void load (const unsigned char *values); 2875 2876 /// Load from an array of 16 char values, convert to float 2877 void load (const char *values); 2878 2879 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 2880 /// Load from an array of 16 half values, convert to float 2881 void load (const half *values); 2882 #endif /* _HALF_H_ or _IMATH_H_ */ 2883 2884 void store (float *values) const; 2885 2886 /// Store the first n values into memory 2887 void store (float *values, int n) const; 2888 2889 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 2890 void store (half *values) const; 2891 #endif 2892 2893 /// Masked load -- read from values[] where mask is 1, load zero where 2894 /// mask is 
0. 2895 void load_mask (const vbool_t &mask, const value_t *values); 2896 void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); } 2897 2898 /// Masked store -- write to values[] where mask is enabled, don't 2899 /// touch values[] where it's not. 2900 void store_mask (const vbool_t &mask, value_t *values) const; 2901 void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); } 2902 2903 /// Load values from addresses (char*)basepatr + vindex[i]*scale 2904 template<int scale=4> 2905 void gather (const value_t *baseptr, const vint_t& vindex); 2906 /// Gather elements defined by the mask, leave others unchanged. 2907 template<int scale=4> 2908 void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex); 2909 template<int scale=4> 2910 void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) { 2911 gather_mask<scale> (vbool_t(mask), baseptr, vindex); 2912 } 2913 2914 /// Store values at addresses (char*)basepatr + vindex[i]*scale 2915 template<int scale=4> 2916 void scatter (value_t *baseptr, const vint_t& vindex) const; 2917 /// Scatter elements defined by the mask 2918 template<int scale=4> 2919 void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const; 2920 template<int scale=4> 2921 void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const { 2922 scatter_mask<scale> (vbool_t(mask), baseptr, vindex); 2923 } 2924 2925 // Arithmetic operators (component-by-component) 2926 friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b); 2927 friend vfloat16 operator- (const vfloat16& a); 2928 friend vfloat16 operator- (const vfloat16& a, const vfloat16& b); 2929 friend vfloat16 operator* (const vfloat16& a, const vfloat16& b); 2930 friend vfloat16 operator* (const vfloat16& a, float b); 2931 friend vfloat16 operator* (float a, const vfloat16& b); 2932 friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b); 2933 
friend vfloat16 operator% (const vfloat16& a, const vfloat16& b); 2934 friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b); 2935 friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b); 2936 friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b); 2937 friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b); 2938 2939 // Comparison operations 2940 friend vbool16 operator== (const vfloat16& a, const vfloat16& b); 2941 friend vbool16 operator!= (const vfloat16& a, const vfloat16& b); 2942 friend vbool16 operator< (const vfloat16& a, const vfloat16& b); 2943 friend vbool16 operator> (const vfloat16& a, const vfloat16& b); 2944 friend vbool16 operator>= (const vfloat16& a, const vfloat16& b); 2945 friend vbool16 operator<= (const vfloat16& a, const vfloat16& b); 2946 2947 // Some oddball items that are handy 2948 2949 /// Stream output 2950 friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val); 2951 2952 protected: 2953 // The actual data representation 2954 union { 2955 simd_t m_simd; 2956 value_t m_val[paddedelements]; 2957 vfloat8 m_8[2]; 2958 }; 2959 }; 2960 2961 2962 /// Shuffle groups of 4 2963 template<int i0, int i1, int i2, int i3> 2964 vfloat16 shuffle4 (const vfloat16& a); 2965 2966 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a) 2967 template<int i> vfloat16 shuffle4 (const vfloat16& a); 2968 2969 /// Shuffle within each group of 4 2970 template<int i0, int i1, int i2, int i3> 2971 vfloat16 shuffle (const vfloat16& a); 2972 2973 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a) 2974 template<int i> vfloat16 shuffle (const vfloat16& a); 2975 2976 /// Helper: as rapid as possible extraction of one component, when the 2977 /// index is fixed. 
2978 template<int i> float extract (const vfloat16& a); 2979 2980 /// Helper: substitute val for a[i] 2981 template<int i> vfloat16 insert (const vfloat16& a, float val); 2982 2983 /// The sum of all components, returned in all components. 2984 vfloat16 vreduce_add (const vfloat16& v); 2985 2986 /// The sum of all components, returned as a scalar. 2987 float reduce_add (const vfloat16& v); 2988 2989 /// Use a bool mask to select between components of a (if mask[i] is false) 2990 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i]. 2991 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool4& mask); 2992 2993 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if 2994 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to 2995 /// blend(0,a,mask). 2996 vfloat16 blend0 (const vfloat16& a, const vbool4& mask); 2997 2998 /// Use a bool mask to select between components of a (if mask[i] is false) 2999 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to 3000 /// blend(0,a,!mask), or blend(a,0,mask). 3001 vfloat16 blend0not (const vfloat16& a, const vbool4& mask); 3002 3003 /// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor 3004 /// that is 0, return 0 rather than Inf. 3005 vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b); 3006 3007 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a 3008 /// synonym for blend with arguments rearranged, but this is more clear 3009 /// because the arguments are symmetric to scalar (cond ? a : b). 
vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);

// Per-element math
vfloat16 abs (const vfloat16& a);    ///< absolute value (float)
vfloat16 sign (const vfloat16& a);   ///< 1.0 when value >= 0, -1 when negative
vfloat16 ceil (const vfloat16& a);
vfloat16 floor (const vfloat16& a);
vint16 ifloor (const vfloat16& a);   ///< (int)floor
inline vint16 floori (const vfloat16& a) { return ifloor(a); }  // DEPRECATED(1.8) alias

/// Per-element round to nearest integer.
/// CAVEAT: the rounding when mid-way between integers may differ depending
/// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
/// integer) but std::round() says to round away from 0 regardless of
/// current rounding mode (but that is multiple instructions on x64).
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::round().
vfloat16 round (const vfloat16& a);

/// Per-element round to nearest integer (equivalent to vint(round(a))).
/// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
/// C++ std::rint() which says to use the current rounding mode.
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::rint().
vint16 rint (const vfloat16& a);

vfloat16 rcp_fast (const vfloat16 &a);  ///< Fast, approximate 1/a
vfloat16 sqrt (const vfloat16 &a);
vfloat16 rsqrt (const vfloat16 &a);   ///< Fully accurate 1/sqrt
vfloat16 rsqrt_fast (const vfloat16 &a);  ///< Fast, approximate 1/sqrt
vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
// vfloat16 exp (const vfloat16& v); // See template with vfloat4
// vfloat16 log (const vfloat16& v); // See template with vfloat4

/// andnot(a,b) returns ((~a) & b)
vfloat16 andnot (const vfloat16& a, const vfloat16& b);

// Fused multiply and add (or subtract):
vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b + c
vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b - c
vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c



// Odds and ends, other CPU hardware tricks

// Try to set the flush_zero_mode CPU flag on x86. Return true if we are
// able, otherwise false (because it's not available on that platform,
// or because it's gcc 4.8 which has a bug that lacks this intrinsic).
inline bool set_flush_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
    return true;
#endif
    // Not available on this platform/compiler combination.
    return false;
}

// Try to set the denorms_zero_mode CPU flag on x86. Return true if we are
// able, otherwise false (because it's not available on that platform,
// or because it's gcc 4.8 which has a bug that lacks this intrinsic).
inline bool set_denorms_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
    return true;
#endif
    // Not available on this platform/compiler combination.
    return false;
}

// Get the flush_zero_mode CPU flag on x86.
inline bool get_flush_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
#endif
    // Not available on this platform/compiler combination.
    return false;
}

// Get the denorms_zero_mode CPU flag on x86.
inline bool get_denorms_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
#endif
    // Not available on this platform/compiler combination.
    return false;
}






//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
//
// Gory implementation details follow.
//
// ^^^ All declarations and documentation is above ^^^
//
// vvv Below is the implementation, often considerably cluttered with
//     #if's for each architecture, and unapologetic use of intrinsics and
//     every manner of dirty trick we can think of to make things fast.
//     Some of this isn't pretty. We won't recapitulate comments or
//     documentation of what the functions are supposed to do, please
//     consult the declarations above for that.
//
//     Here be dragons.
3116 // 3117 ////////////////////////////////////////////////////////////////////////// 3118 ////////////////////////////////////////////////////////////////////////// 3119 3120 3121 3122 ////////////////////////////////////////////////////////////////////// 3123 // vbool4 implementation 3124 3125 3126 OIIO_FORCEINLINE int vbool4::operator[] (int i) const { 3127 OIIO_DASSERT(i >= 0 && i < elements); 3128 #if OIIO_SIMD_SSE 3129 return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0; 3130 #else 3131 return m_val[i]; 3132 #endif 3133 } 3134 3135 OIIO_FORCEINLINE int& vbool4::operator[] (int i) { 3136 OIIO_DASSERT(i >= 0 && i < elements); 3137 return m_val[i]; 3138 } 3139 3140 3141 OIIO_FORCEINLINE void vbool4::setcomp (int i, bool value) { 3142 OIIO_DASSERT(i >= 0 && i < elements); 3143 m_val[i] = value ? -1 : 0; 3144 } 3145 3146 3147 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) { 3148 cout << a[0]; 3149 for (int i = 1; i < a.elements; ++i) 3150 cout << ' ' << a[i]; 3151 return cout; 3152 } 3153 3154 3155 OIIO_FORCEINLINE void vbool4::load (bool a) { 3156 #if OIIO_SIMD_SSE 3157 m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a))); 3158 #elif OIIO_SIMD_NEON 3159 m_simd = vdupq_n_u32(a ? 0xffffffff : 0); 3160 #else 3161 int val = -int(a); 3162 SIMD_CONSTRUCT (val); 3163 #endif 3164 } 3165 3166 3167 OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) { 3168 #if OIIO_SIMD_SSE 3169 // N.B. -- we need to reverse the order because of our convention 3170 // of storing a,b,c,d in the same order in memory. 
3171 m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a))); 3172 // #elif OIIO_SIMD_NEON 3173 // FIXME 3174 #else 3175 m_val[0] = -int(a); 3176 m_val[1] = -int(b); 3177 m_val[2] = -int(c); 3178 m_val[3] = -int(d); 3179 #endif 3180 } 3181 3182 OIIO_FORCEINLINE vbool4::vbool4 (const bool *a) { 3183 load (a[0], a[1], a[2], a[3]); 3184 } 3185 3186 OIIO_FORCEINLINE const vbool4& vbool4::operator= (const vbool4 & other) { 3187 m_simd = other.m_simd; 3188 return *this; 3189 } 3190 3191 3192 OIIO_FORCEINLINE int vbool4::bitmask () const { 3193 #if OIIO_SIMD_SSE 3194 return _mm_movemask_ps(m_simd); 3195 #else 3196 int r = 0; 3197 for (int i = 0; i < elements; ++i) 3198 if (m_val[i]) 3199 r |= 1<<i; 3200 return r; 3201 #endif 3202 } 3203 3204 3205 OIIO_FORCEINLINE vbool4 3206 vbool4::from_bitmask (int bitmask) { 3207 // I think this is a fast conversion from int bitmask to vbool4 3208 return (vint4::Giota() & vint4(bitmask)) != vint4::Zero(); 3209 } 3210 3211 3212 OIIO_FORCEINLINE void vbool4::clear () { 3213 #if OIIO_SIMD_SSE 3214 m_simd = _mm_setzero_ps(); 3215 #else 3216 *this = false; 3217 #endif 3218 } 3219 3220 3221 OIIO_FORCEINLINE const vbool4 vbool4::False () { 3222 #if OIIO_SIMD_SSE 3223 return _mm_setzero_ps(); 3224 #else 3225 return false; 3226 #endif 3227 } 3228 3229 OIIO_FORCEINLINE const vbool4 vbool4::True () { 3230 // Fastest way to fill with all 1 bits is to cmp any value to itself. 3231 #if OIIO_SIMD_SSE 3232 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000) 3233 __m128i anyval = _mm_undefined_si128(); 3234 # else 3235 __m128i anyval = _mm_setzero_si128(); 3236 # endif 3237 return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval)); 3238 #else 3239 return true; 3240 #endif 3241 } 3242 3243 OIIO_FORCEINLINE void vbool4::store (bool *values) const { 3244 SIMD_DO (values[i] = m_val[i] ? 
true : false); 3245 } 3246 3247 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const { 3248 OIIO_DASSERT (n >= 0 && n <= elements); 3249 for (int i = 0; i < n; ++i) 3250 values[i] = m_val[i] ? true : false; 3251 } 3252 3253 3254 3255 OIIO_FORCEINLINE vbool4 operator! (const vbool4 & a) { 3256 #if OIIO_SIMD_SSE 3257 return _mm_xor_ps (a.simd(), vbool4::True()); 3258 #elif OIIO_SIMD_NEON 3259 return vmvnq_u32(a.simd()); 3260 #else 3261 SIMD_RETURN (vbool4, a[i] ^ (-1)); 3262 #endif 3263 } 3264 3265 OIIO_FORCEINLINE vbool4 operator& (const vbool4 & a, const vbool4 & b) { 3266 #if OIIO_SIMD_SSE 3267 return _mm_and_ps (a.simd(), b.simd()); 3268 #elif OIIO_SIMD_NEON 3269 return vandq_u32(a.simd(), b.simd()); 3270 #else 3271 SIMD_RETURN (vbool4, a[i] & b[i]); 3272 #endif 3273 } 3274 3275 OIIO_FORCEINLINE vbool4 operator| (const vbool4 & a, const vbool4 & b) { 3276 #if OIIO_SIMD_SSE 3277 return _mm_or_ps (a.simd(), b.simd()); 3278 #elif OIIO_SIMD_NEON 3279 return vorrq_u32(a.simd(), b.simd()); 3280 #else 3281 SIMD_RETURN (vbool4, a[i] | b[i]); 3282 #endif 3283 } 3284 3285 OIIO_FORCEINLINE vbool4 operator^ (const vbool4& a, const vbool4& b) { 3286 #if OIIO_SIMD_SSE 3287 return _mm_xor_ps (a.simd(), b.simd()); 3288 #elif OIIO_SIMD_NEON 3289 return veorq_u32(a.simd(), b.simd()); 3290 #else 3291 SIMD_RETURN (vbool4, a[i] ^ b[i]); 3292 #endif 3293 } 3294 3295 3296 OIIO_FORCEINLINE const vbool4& operator&= (vbool4& a, const vbool4 &b) { 3297 return a = a & b; 3298 } 3299 3300 OIIO_FORCEINLINE const vbool4& operator|= (vbool4& a, const vbool4& b) { 3301 return a = a | b; 3302 } 3303 3304 OIIO_FORCEINLINE const vbool4& operator^= (vbool4& a, const vbool4& b) { 3305 return a = a ^ b; 3306 } 3307 3308 OIIO_FORCEINLINE vbool4 operator~ (const vbool4& a) { 3309 #if OIIO_SIMD_SSE 3310 // Fastest way to bit-complement in SSE is to xor with 0xffffffff. 
3311 return _mm_xor_ps (a.simd(), vbool4::True()); 3312 #elif OIIO_SIMD_NEON 3313 return vmvnq_u32(a.m_simd); 3314 #else 3315 SIMD_RETURN (vbool4, ~a[i]); 3316 #endif 3317 } 3318 3319 OIIO_FORCEINLINE vbool4 operator== (const vbool4 & a, const vbool4 & b) { 3320 #if OIIO_SIMD_SSE 3321 return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b))); 3322 #elif OIIO_SIMD_NEON 3323 return vceqq_u32 (a.m_simd, b.m_simd); 3324 #else 3325 SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0); 3326 #endif 3327 } 3328 3329 OIIO_FORCEINLINE vbool4 operator!= (const vbool4 & a, const vbool4 & b) { 3330 #if OIIO_SIMD_SSE 3331 return _mm_xor_ps (a, b); 3332 #elif OIIO_SIMD_NEON 3333 return !(a == b); 3334 #else 3335 SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0); 3336 #endif 3337 } 3338 3339 3340 3341 3342 #if OIIO_SIMD_SSE 3343 // Shuffling. Use like this: x = shuffle<3,2,1,0>(b) 3344 template<int i0, int i1, int i2, int i3> 3345 OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) { 3346 return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); 3347 } 3348 #endif 3349 3350 #if OIIO_SIMD_SSE >= 3 3351 // SSE3 has intrinsics for a few special cases 3352 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) { 3353 return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a))); 3354 } 3355 template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) { 3356 return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a))); 3357 } 3358 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) { 3359 return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(a))); 3360 } 3361 #endif 3362 3363 #if OIIO_SIMD_SSE 3364 template<int i0, int i1, int i2, int i3> 3365 OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) { 3366 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0))); 3367 } 3368 #endif 3369 3370 #if OIIO_SIMD_SSE >= 3 3371 // SSE3 has intrinsics for a few special cases 3372 template<> OIIO_FORCEINLINE 
__m128 shuffle_sse<0, 0, 2, 2> (__m128 a) { 3373 return _mm_moveldup_ps(a); 3374 } 3375 template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) { 3376 return _mm_movehdup_ps(a); 3377 } 3378 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) { 3379 return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a))); 3380 } 3381 #endif 3382 3383 3384 /// Helper: shuffle/swizzle with constant (templated) indices. 3385 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c) 3386 template<int i0, int i1, int i2, int i3> 3387 OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) { 3388 #if OIIO_SIMD_SSE 3389 return shuffle_sse<i0,i1,i2,i3> (a.simd()); 3390 #else 3391 return vbool4 (a[i0], a[i1], a[i2], a[i3]); 3392 #endif 3393 } 3394 3395 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a) 3396 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) { 3397 return shuffle<i,i,i,i>(a); 3398 } 3399 3400 3401 /// Helper: as rapid as possible extraction of one component, when the 3402 /// index is fixed. 
3403 template<int i> 3404 OIIO_FORCEINLINE bool extract (const vbool4& a) { 3405 #if OIIO_SIMD_SSE >= 4 3406 return _mm_extract_epi32(_mm_castps_si128(a.simd()), i); // SSE4.1 only 3407 #else 3408 return a[i]; 3409 #endif 3410 } 3411 3412 /// Helper: substitute val for a[i] 3413 template<int i> 3414 OIIO_FORCEINLINE vbool4 insert (const vbool4& a, bool val) { 3415 #if OIIO_SIMD_SSE >= 4 3416 int ival = -int(val); 3417 return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i)); 3418 #else 3419 vbool4 tmp = a; 3420 tmp[i] = -int(val); 3421 return tmp; 3422 #endif 3423 } 3424 3425 OIIO_FORCEINLINE bool reduce_and (const vbool4& v) { 3426 #if OIIO_SIMD_AVX 3427 return _mm_testc_ps (v, vbool4(true)) != 0; 3428 #elif OIIO_SIMD_SSE 3429 return _mm_movemask_ps(v.simd()) == 0xf; 3430 #else 3431 SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0)); 3432 #endif 3433 } 3434 3435 OIIO_FORCEINLINE bool reduce_or (const vbool4& v) { 3436 #if OIIO_SIMD_AVX 3437 return ! _mm_testz_ps (v, v); 3438 #elif OIIO_SIMD_SSE 3439 return _mm_movemask_ps(v) != 0; 3440 #else 3441 SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0)); 3442 #endif 3443 } 3444 3445 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; } 3446 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; } 3447 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; } 3448 3449 3450 3451 ////////////////////////////////////////////////////////////////////// 3452 // vbool8 implementation 3453 3454 3455 OIIO_FORCEINLINE int vbool8::operator[] (int i) const { 3456 OIIO_DASSERT(i >= 0 && i < elements); 3457 #if OIIO_SIMD_AVX 3458 return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0; 3459 #else 3460 return m_val[i]; 3461 #endif 3462 } 3463 3464 OIIO_FORCEINLINE void vbool8::setcomp (int i, bool value) { 3465 OIIO_DASSERT(i >= 0 && i < elements); 3466 m_val[i] = value ? 
-1 : 0; 3467 } 3468 3469 OIIO_FORCEINLINE int& vbool8::operator[] (int i) { 3470 OIIO_DASSERT(i >= 0 && i < elements); 3471 return m_val[i]; 3472 } 3473 3474 3475 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) { 3476 cout << a[0]; 3477 for (int i = 1; i < a.elements; ++i) 3478 cout << ' ' << a[i]; 3479 return cout; 3480 } 3481 3482 3483 OIIO_FORCEINLINE void vbool8::load (bool a) { 3484 #if OIIO_SIMD_AVX 3485 m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a))); 3486 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 3487 m_4[0].load(a); 3488 m_4[1].load(a); 3489 #else 3490 int val = -int(a); 3491 SIMD_CONSTRUCT (val); 3492 #endif 3493 } 3494 3495 3496 OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d, 3497 bool e, bool f, bool g, bool h) { 3498 #if OIIO_SIMD_AVX 3499 // N.B. -- we need to reverse the order because of our convention 3500 // of storing a,b,c,d in the same order in memory. 3501 m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e), 3502 -int(d), -int(c), -int(b), -int(a))); 3503 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 3504 m_4[0].load(a, b, c, d); 3505 m_4[1].load(e, f, g, h); 3506 #else 3507 m_val[0] = -int(a); 3508 m_val[1] = -int(b); 3509 m_val[2] = -int(c); 3510 m_val[3] = -int(d); 3511 m_val[4] = -int(e); 3512 m_val[5] = -int(f); 3513 m_val[6] = -int(g); 3514 m_val[7] = -int(h); 3515 #endif 3516 } 3517 3518 OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d, 3519 bool e, bool f, bool g, bool h) { 3520 load (a, b, c, d, e, f, g, h); 3521 } 3522 3523 OIIO_FORCEINLINE vbool8::vbool8 (int a, int b, int c, int d, 3524 int e, int f, int g, int h) { 3525 load (bool(a), bool(b), bool(c), bool(d), 3526 bool(e), bool(f), bool(g), bool(h)); 3527 } 3528 3529 OIIO_FORCEINLINE vbool8::vbool8 (const bool *a) { 3530 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); 3531 } 3532 3533 3534 OIIO_FORCEINLINE const vbool8& vbool8::operator= (bool a) { 3535 load(a); 3536 return *this; 
3537 } 3538 3539 OIIO_FORCEINLINE const vbool8& vbool8::operator= (const vbool8 & other) { 3540 m_simd = other.m_simd; 3541 return *this; 3542 } 3543 3544 OIIO_FORCEINLINE int vbool8::bitmask () const { 3545 #if OIIO_SIMD_AVX 3546 return _mm256_movemask_ps(m_simd); 3547 #else 3548 return lo().bitmask() | (hi().bitmask() << 4); 3549 #endif 3550 } 3551 3552 3553 OIIO_FORCEINLINE vbool8 3554 vbool8::from_bitmask (int bitmask) { 3555 // I think this is a fast conversion from int bitmask to vbool8 3556 return (vint8::Giota() & vint8(bitmask)) != vint8::Zero(); 3557 } 3558 3559 3560 OIIO_FORCEINLINE void vbool8::clear () { 3561 #if OIIO_SIMD_AVX 3562 m_simd = _mm256_setzero_ps(); 3563 #else 3564 *this = false; 3565 #endif 3566 } 3567 3568 OIIO_FORCEINLINE const vbool8 vbool8::False () { 3569 #if OIIO_SIMD_AVX 3570 return _mm256_setzero_ps(); 3571 #else 3572 return false; 3573 #endif 3574 } 3575 3576 3577 OIIO_FORCEINLINE const vbool8 vbool8::True () { 3578 #if OIIO_SIMD_AVX 3579 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000) 3580 // Fastest way to fill with all 1 bits is to cmp any value to itself. 3581 __m256i anyval = _mm256_undefined_si256(); 3582 return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval)); 3583 # else 3584 return _mm256_castsi256_ps (_mm256_set1_epi32 (-1)); 3585 # endif 3586 #else 3587 return true; 3588 #endif 3589 } 3590 3591 3592 OIIO_FORCEINLINE void vbool8::store (bool *values) const { 3593 SIMD_DO (values[i] = m_val[i] ? true : false); 3594 } 3595 3596 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const { 3597 OIIO_DASSERT (n >= 0 && n <= elements); 3598 for (int i = 0; i < n; ++i) 3599 values[i] = m_val[i] ? 
true : false; 3600 } 3601 3602 3603 OIIO_FORCEINLINE vbool4 vbool8::lo () const { 3604 #if OIIO_SIMD_AVX 3605 return _mm256_castps256_ps128 (simd()); 3606 #else 3607 return m_4[0]; 3608 #endif 3609 } 3610 3611 OIIO_FORCEINLINE vbool4 vbool8::hi () const { 3612 #if OIIO_SIMD_AVX 3613 return _mm256_extractf128_ps (simd(), 1); 3614 #else 3615 return m_4[1]; 3616 #endif 3617 } 3618 3619 3620 OIIO_FORCEINLINE vbool8::vbool8 (const vbool4& lo, const vbool4 &hi) { 3621 #if OIIO_SIMD_AVX 3622 __m256 r = _mm256_castps128_ps256 (lo); 3623 m_simd = _mm256_insertf128_ps (r, hi, 1); 3624 // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo); 3625 #else 3626 m_4[0] = lo; 3627 m_4[1] = hi; 3628 #endif 3629 } 3630 3631 3632 OIIO_FORCEINLINE vbool8 operator! (const vbool8 & a) { 3633 #if OIIO_SIMD_AVX 3634 return _mm256_xor_ps (a.simd(), vbool8::True()); 3635 #else 3636 SIMD_RETURN (vbool8, a[i] ^ (-1)); 3637 #endif 3638 } 3639 3640 OIIO_FORCEINLINE vbool8 operator& (const vbool8 & a, const vbool8 & b) { 3641 #if OIIO_SIMD_AVX 3642 return _mm256_and_ps (a.simd(), b.simd()); 3643 #else 3644 SIMD_RETURN (vbool8, a[i] & b[i]); 3645 #endif 3646 } 3647 3648 OIIO_FORCEINLINE vbool8 operator| (const vbool8 & a, const vbool8 & b) { 3649 #if OIIO_SIMD_AVX 3650 return _mm256_or_ps (a.simd(), b.simd()); 3651 #else 3652 SIMD_RETURN (vbool8, a[i] | b[i]); 3653 #endif 3654 } 3655 3656 OIIO_FORCEINLINE vbool8 operator^ (const vbool8& a, const vbool8& b) { 3657 #if OIIO_SIMD_AVX 3658 return _mm256_xor_ps (a.simd(), b.simd()); 3659 #else 3660 SIMD_RETURN (vbool8, a[i] ^ b[i]); 3661 #endif 3662 } 3663 3664 3665 OIIO_FORCEINLINE const vbool8& operator&= (vbool8& a, const vbool8 &b) { 3666 return a = a & b; 3667 } 3668 3669 OIIO_FORCEINLINE const vbool8& operator|= (vbool8& a, const vbool8& b) { 3670 return a = a | b; 3671 } 3672 3673 OIIO_FORCEINLINE const vbool8& operator^= (vbool8& a, const vbool8& b) { 3674 return a = a ^ b; 3675 } 3676 3677 3678 OIIO_FORCEINLINE vbool8 operator~ 
(const vbool8& a) { 3679 #if OIIO_SIMD_AVX 3680 // Fastest way to bit-complement in SSE is to xor with 0xffffffff. 3681 return _mm256_xor_ps (a.simd(), vbool8::True()); 3682 #else 3683 SIMD_RETURN (vbool8, ~a[i]); 3684 #endif 3685 } 3686 3687 3688 OIIO_FORCEINLINE vbool8 operator== (const vbool8 & a, const vbool8 & b) { 3689 #if OIIO_SIMD_AVX >= 2 3690 return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b))); 3691 #elif OIIO_SIMD_AVX 3692 return _mm256_cmp_ps (a, b, _CMP_EQ_UQ); 3693 #else 3694 SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0); 3695 #endif 3696 } 3697 3698 OIIO_FORCEINLINE vbool8 operator!= (const vbool8 & a, const vbool8 & b) { 3699 #if OIIO_SIMD_AVX 3700 return _mm256_xor_ps (a, b); 3701 #else 3702 SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0); 3703 #endif 3704 } 3705 3706 3707 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 3708 OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) { 3709 #if OIIO_SIMD_AVX >= 2 3710 vint8 index (i0, i1, i2, i3, i4, i5, i6, i7); 3711 return _mm256_permutevar8x32_ps (a.simd(), index.simd()); 3712 #else 3713 return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]); 3714 #endif 3715 } 3716 3717 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) { 3718 return shuffle<i,i,i,i,i,i,i,i>(a); 3719 } 3720 3721 3722 template<int i> 3723 OIIO_FORCEINLINE bool extract (const vbool8& a) { 3724 #if OIIO_SIMD_AVX && !_WIN32 3725 return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i); // SSE4.1 only 3726 #else 3727 return a[i]; 3728 #endif 3729 } 3730 3731 template<int i> 3732 OIIO_FORCEINLINE vbool8 insert (const vbool8& a, bool val) { 3733 #if OIIO_SIMD_AVX && !_WIN32 3734 int ival = -int(val); 3735 return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i)); 3736 #else 3737 vbool8 tmp = a; 3738 tmp[i] = -int(val); 3739 return tmp; 3740 #endif 3741 } 3742 3743 3744 OIIO_FORCEINLINE bool reduce_and (const 
vbool8& v) { 3745 #if OIIO_SIMD_AVX 3746 return _mm256_testc_ps (v, vbool8(true)) != 0; 3747 // return _mm256_movemask_ps(v.simd()) == 0xff; 3748 #else 3749 SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i])); 3750 #endif 3751 } 3752 3753 OIIO_FORCEINLINE bool reduce_or (const vbool8& v) { 3754 #if OIIO_SIMD_AVX 3755 return ! _mm256_testz_ps (v, v); // FIXME? Not in all immintrin.h ! 3756 // return _mm256_movemask_ps(v) != 0; 3757 #else 3758 SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i])); 3759 #endif 3760 } 3761 3762 3763 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; } 3764 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; } 3765 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; } 3766 3767 3768 3769 ////////////////////////////////////////////////////////////////////// 3770 // vbool16 implementation 3771 3772 3773 OIIO_FORCEINLINE int vbool16::operator[] (int i) const { 3774 OIIO_DASSERT(i >= 0 && i < elements); 3775 #if OIIO_SIMD_AVX >= 512 3776 return (int(m_simd) >> i) & 1; 3777 #else 3778 return (m_bits >> i) & 1; 3779 #endif 3780 } 3781 3782 OIIO_FORCEINLINE void vbool16::setcomp (int i, bool value) { 3783 OIIO_DASSERT(i >= 0 && i < elements); 3784 int bits = m_bits; 3785 bits &= (0xffff ^ (1<<i)); 3786 bits |= (int(value)<<i); 3787 m_bits = bits; 3788 } 3789 3790 3791 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) { 3792 cout << a[0]; 3793 for (int i = 1; i < a.elements; ++i) 3794 cout << ' ' << a[i]; 3795 return cout; 3796 } 3797 3798 3799 OIIO_FORCEINLINE void vbool16::load (bool a) { 3800 m_simd = a ? 
0xffff : 0; 3801 } 3802 3803 3804 OIIO_FORCEINLINE void vbool16::load_bitmask (int a) { 3805 m_simd = simd_t(a); 3806 } 3807 3808 3809 OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3, 3810 bool v4, bool v5, bool v6, bool v7, 3811 bool v8, bool v9, bool v10, bool v11, 3812 bool v12, bool v13, bool v14, bool v15) { 3813 m_simd = simd_t((int(v0) << 0) | 3814 (int(v1) << 1) | 3815 (int(v2) << 2) | 3816 (int(v3) << 3) | 3817 (int(v4) << 4) | 3818 (int(v5) << 5) | 3819 (int(v6) << 6) | 3820 (int(v7) << 7) | 3821 (int(v8) << 8) | 3822 (int(v9) << 9) | 3823 (int(v10) << 10) | 3824 (int(v11) << 11) | 3825 (int(v12) << 12) | 3826 (int(v13) << 13) | 3827 (int(v14) << 14) | 3828 (int(v15) << 15)); 3829 } 3830 3831 OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3, 3832 bool v4, bool v5, bool v6, bool v7, 3833 bool v8, bool v9, bool v10, bool v11, 3834 bool v12, bool v13, bool v14, bool v15) { 3835 load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); 3836 } 3837 3838 OIIO_FORCEINLINE vbool16::vbool16 (int v0, int v1, int v2, int v3, 3839 int v4, int v5, int v6, int v7, 3840 int v8, int v9, int v10, int v11, 3841 int v12, int v13, int v14, int v15) { 3842 load (bool(v0), bool(v1), bool(v2), bool(v3), 3843 bool(v4), bool(v5), bool(v6), bool(v7), 3844 bool(v8), bool(v9), bool(v10), bool(v11), 3845 bool(v12), bool(v13), bool(v14), bool(v15)); 3846 } 3847 3848 OIIO_FORCEINLINE vbool16::vbool16 (const vbool8& a, const vbool8& b) { 3849 load_bitmask (a.bitmask() | (b.bitmask() << 8)); 3850 } 3851 3852 OIIO_FORCEINLINE vbool16::vbool16 (const bool *a) { 3853 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], 3854 a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]); 3855 } 3856 3857 3858 OIIO_FORCEINLINE const vbool16& vbool16::operator= (bool a) { 3859 load(a); 3860 return *this; 3861 } 3862 3863 OIIO_FORCEINLINE const vbool16& vbool16::operator= (const vbool16 & other) { 3864 m_simd = other.m_simd; 3865 return 
*this; 3866 } 3867 3868 3869 OIIO_FORCEINLINE int vbool16::bitmask () const { 3870 #if OIIO_SIMD_AVX >= 512 3871 return int(m_simd); 3872 #else 3873 return int(m_bits); 3874 #endif 3875 } 3876 3877 3878 OIIO_FORCEINLINE void vbool16::clear () { 3879 m_simd = simd_t(0); 3880 } 3881 3882 OIIO_FORCEINLINE const vbool16 vbool16::False () { 3883 return simd_t(0); 3884 } 3885 3886 3887 OIIO_FORCEINLINE const vbool16 vbool16::True () { 3888 return simd_t(0xffff); 3889 } 3890 3891 3892 OIIO_FORCEINLINE void vbool16::store (bool *values) const { 3893 SIMD_DO (values[i] = m_bits & (1<<i)); 3894 } 3895 3896 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const { 3897 OIIO_DASSERT (n >= 0 && n <= elements); 3898 for (int i = 0; i < n; ++i) 3899 values[i] = m_bits & (1<<i); 3900 } 3901 3902 3903 3904 OIIO_FORCEINLINE vbool8 vbool16::lo () const { 3905 #if OIIO_SIMD_AVX >= 512 3906 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1)); 3907 #else 3908 SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0); 3909 #endif 3910 } 3911 3912 OIIO_FORCEINLINE vbool8 vbool16::hi () const { 3913 #if OIIO_SIMD_AVX >= 512 3914 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1)); 3915 #else 3916 SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0); 3917 #endif 3918 } 3919 3920 3921 OIIO_FORCEINLINE vbool16 operator! 
(const vbool16 & a) { 3922 #if OIIO_SIMD_AVX >= 512 3923 return _mm512_knot (a.simd()); 3924 #else 3925 return vbool16 (a.m_bits ^ 0xffff); 3926 #endif 3927 } 3928 3929 OIIO_FORCEINLINE vbool16 operator& (const vbool16 & a, const vbool16 & b) { 3930 #if OIIO_SIMD_AVX >= 512 3931 return _mm512_kand (a.simd(), b.simd()); 3932 #else 3933 return vbool16 (a.m_bits & b.m_bits); 3934 #endif 3935 } 3936 3937 OIIO_FORCEINLINE vbool16 operator| (const vbool16 & a, const vbool16 & b) { 3938 #if OIIO_SIMD_AVX >= 512 3939 return _mm512_kor (a.simd(), b.simd()); 3940 #else 3941 return vbool16 (a.m_bits | b.m_bits); 3942 #endif 3943 } 3944 3945 OIIO_FORCEINLINE vbool16 operator^ (const vbool16& a, const vbool16& b) { 3946 #if OIIO_SIMD_AVX >= 512 3947 return _mm512_kxor (a.simd(), b.simd()); 3948 #else 3949 return vbool16 (a.m_bits ^ b.m_bits); 3950 #endif 3951 } 3952 3953 3954 OIIO_FORCEINLINE const vbool16& operator&= (vbool16& a, const vbool16 &b) { 3955 return a = a & b; 3956 } 3957 3958 OIIO_FORCEINLINE const vbool16& operator|= (vbool16& a, const vbool16& b) { 3959 return a = a | b; 3960 } 3961 3962 OIIO_FORCEINLINE const vbool16& operator^= (vbool16& a, const vbool16& b) { 3963 return a = a ^ b; 3964 } 3965 3966 3967 OIIO_FORCEINLINE vbool16 operator~ (const vbool16& a) { 3968 return a ^ vbool16::True(); 3969 } 3970 3971 3972 OIIO_FORCEINLINE vbool16 operator== (const vbool16 & a, const vbool16 & b) { 3973 #if OIIO_SIMD_AVX >= 512 3974 return _mm512_kxnor (a.simd(), b.simd()); 3975 #else 3976 return vbool16 (!(a.m_bits ^ b.m_bits)); 3977 #endif 3978 } 3979 3980 OIIO_FORCEINLINE vbool16 operator!= (const vbool16 & a, const vbool16 & b) { 3981 #if OIIO_SIMD_AVX >= 512 3982 return _mm512_kxor (a.simd(), b.simd()); 3983 #else 3984 return vbool16 (a.m_bits ^ b.m_bits); 3985 #endif 3986 } 3987 3988 3989 template<int i> 3990 OIIO_FORCEINLINE bool extract (const vbool16& a) { 3991 return a[i]; 3992 } 3993 3994 template<int i> 3995 OIIO_FORCEINLINE vbool16 insert (const vbool16& a, 
bool val) { 3996 vbool16 tmp = a; 3997 tmp.setcomp (i, val); 3998 return tmp; 3999 } 4000 4001 4002 OIIO_FORCEINLINE bool reduce_and (const vbool16& v) { 4003 return v.bitmask() == 0xffff; 4004 } 4005 4006 OIIO_FORCEINLINE bool reduce_or (const vbool16& v) { 4007 return v.bitmask() != 0; 4008 } 4009 4010 4011 OIIO_FORCEINLINE bool all (const vbool16& v) { return reduce_and(v) == true; } 4012 OIIO_FORCEINLINE bool any (const vbool16& v) { return reduce_or(v) == true; } 4013 OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; } 4014 4015 4016 4017 4018 4019 4020 ////////////////////////////////////////////////////////////////////// 4021 // vint4 implementation 4022 4023 OIIO_FORCEINLINE const vint4 & vint4::operator= (const vint4& other) { 4024 m_simd = other.m_simd; 4025 return *this; 4026 } 4027 4028 OIIO_FORCEINLINE int vint4::operator[] (int i) const { 4029 OIIO_DASSERT(i<elements); 4030 return m_val[i]; 4031 } 4032 4033 OIIO_FORCEINLINE int& vint4::operator[] (int i) { 4034 OIIO_DASSERT(i<elements); 4035 return m_val[i]; 4036 } 4037 4038 OIIO_FORCEINLINE void vint4::setcomp (int i, int val) { 4039 OIIO_DASSERT(i<elements); 4040 m_val[i] = val; 4041 } 4042 4043 4044 OIIO_FORCEINLINE void vint4::load (int a) { 4045 #if OIIO_SIMD_SSE 4046 m_simd = _mm_set1_epi32 (a); 4047 #elif OIIO_SIMD_NEON 4048 m_simd = vdupq_n_s32 (a); 4049 #else 4050 SIMD_CONSTRUCT (a); 4051 #endif 4052 } 4053 4054 4055 4056 OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d) { 4057 #if OIIO_SIMD_SSE 4058 m_simd = _mm_set_epi32 (d, c, b, a); 4059 #elif OIIO_SIMD_NEON 4060 int values[4] = { a, b, c, d }; 4061 m_simd = vld1q_s32 (values); 4062 #else 4063 m_val[0] = a; 4064 m_val[1] = b; 4065 m_val[2] = c; 4066 m_val[3] = d; 4067 #endif 4068 } 4069 4070 4071 // OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d, 4072 // int e, int f, int g, int h) { 4073 // load (a, b, c, d); 4074 // } 4075 4076 4077 4078 OIIO_FORCEINLINE void vint4::load (const int 
*values) { 4079 #if OIIO_SIMD_SSE 4080 m_simd = _mm_loadu_si128 ((const simd_t *)values); 4081 #else 4082 SIMD_CONSTRUCT (values[i]); 4083 #endif 4084 } 4085 4086 4087 OIIO_FORCEINLINE void vint4::load (const int *values, int n) 4088 { 4089 OIIO_DASSERT (n >= 0 && n <= elements); 4090 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4091 m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values); 4092 #elif OIIO_SIMD_SSE 4093 switch (n) { 4094 case 1: 4095 m_simd = _mm_castps_si128 (_mm_load_ss ((const float *)values)); 4096 break; 4097 case 2: 4098 // Trickery: load one double worth of bits! 4099 m_simd = _mm_castpd_si128 (_mm_load_sd ((const double*)values)); 4100 break; 4101 case 3: 4102 // Trickery: load one double worth of bits, then a float, 4103 // and combine, casting to ints. 4104 m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((const double*)values)), 4105 _mm_load_ss ((const float *)values + 2))); 4106 break; 4107 case 4: 4108 m_simd = _mm_loadu_si128 ((const simd_t *)values); 4109 break; 4110 default: 4111 clear (); 4112 break; 4113 } 4114 #else 4115 for (int i = 0; i < n; ++i) 4116 m_val[i] = values[i]; 4117 for (int i = n; i < elements; ++i) 4118 m_val[i] = 0; 4119 #endif 4120 } 4121 4122 4123 OIIO_FORCEINLINE void vint4::load (const unsigned short *values) { 4124 #if OIIO_SIMD_SSE >= 4 4125 // Trickery: load one double worth of bits = 4 ushorts! 4126 simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values)); 4127 m_simd = _mm_cvtepu16_epi32 (a); 4128 #else 4129 SIMD_CONSTRUCT (values[i]); 4130 #endif 4131 } 4132 4133 4134 OIIO_FORCEINLINE void vint4::load (const short *values) { 4135 #if OIIO_SIMD_SSE >= 4 4136 // Trickery: load one double worth of bits = 4 shorts! 
4137 simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values)); 4138 m_simd = _mm_cvtepi16_epi32 (a); 4139 #else 4140 SIMD_CONSTRUCT (values[i]); 4141 #endif 4142 } 4143 4144 4145 OIIO_FORCEINLINE void vint4::load (const unsigned char *values) { 4146 #if OIIO_SIMD_SSE >= 4 4147 // Trickery: load one float worth of bits = 4 uchars! 4148 simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values)); 4149 m_simd = _mm_cvtepu8_epi32 (a); 4150 #else 4151 SIMD_CONSTRUCT (values[i]); 4152 #endif 4153 } 4154 4155 4156 OIIO_FORCEINLINE void vint4::load (const char *values) { 4157 #if OIIO_SIMD_SSE >= 4 4158 // Trickery: load one float worth of bits = 4 chars! 4159 simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values)); 4160 m_simd = _mm_cvtepi8_epi32 (a); 4161 #else 4162 SIMD_CONSTRUCT (values[i]); 4163 #endif 4164 } 4165 4166 4167 OIIO_FORCEINLINE vint4::vint4 (int a) { load(a); } 4168 4169 OIIO_FORCEINLINE vint4::vint4 (int a, int b) { load(a,a,b,b); } 4170 4171 OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d) { load(a,b,c,d); } 4172 4173 // OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d, 4174 // int e, int f, int g, int h) { 4175 // load(a,b,c,d,e,f,g,h); 4176 // } 4177 4178 OIIO_FORCEINLINE vint4::vint4 (const int *vals) { load (vals); } 4179 OIIO_FORCEINLINE vint4::vint4 (const unsigned short *vals) { load(vals); } 4180 OIIO_FORCEINLINE vint4::vint4 (const short *vals) { load(vals); } 4181 OIIO_FORCEINLINE vint4::vint4 (const unsigned char *vals) { load(vals); } 4182 OIIO_FORCEINLINE vint4::vint4 (const char *vals) { load(vals); } 4183 4184 OIIO_FORCEINLINE const vint4 & vint4::operator= (int a) { load(a); return *this; } 4185 4186 4187 OIIO_FORCEINLINE void vint4::store (int *values) const { 4188 #if OIIO_SIMD_SSE 4189 // Use an unaligned store -- it's just as fast when the memory turns 4190 // out to be aligned, nearly as fast even when unaligned. Not worth 4191 // the headache of using stores that require alignment. 
4192 _mm_storeu_si128 ((simd_t *)values, m_simd); 4193 #else 4194 SIMD_DO (values[i] = m_val[i]); 4195 #endif 4196 } 4197 4198 4199 OIIO_FORCEINLINE void vint4::load_mask (int mask, const value_t *values) { 4200 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4201 m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values); 4202 #elif OIIO_SIMD_AVX >= 2 4203 m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask))); 4204 #else 4205 SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0); 4206 #endif 4207 } 4208 4209 4210 OIIO_FORCEINLINE void vint4::load_mask (const vbool_t& mask, const value_t *values) { 4211 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4212 m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values); 4213 #elif OIIO_SIMD_AVX >= 2 4214 m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(mask)); 4215 #else 4216 SIMD_CONSTRUCT (mask[i] ? values[i] : 0); 4217 #endif 4218 } 4219 4220 4221 OIIO_FORCEINLINE void vint4::store_mask (int mask, value_t *values) const { 4222 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4223 _mm_mask_storeu_epi32 (values, __mmask8(mask), m_simd); 4224 #elif OIIO_SIMD_AVX >= 2 4225 _mm_maskstore_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd); 4226 #else 4227 SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]); 4228 #endif 4229 } 4230 4231 4232 OIIO_FORCEINLINE void vint4::store_mask (const vbool_t& mask, value_t *values) const { 4233 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4234 _mm_mask_storeu_epi32 (values, mask.bitmask(), m_simd); 4235 #elif OIIO_SIMD_AVX >= 2 4236 _mm_maskstore_epi32 (values, _mm_castps_si128(mask), m_simd); 4237 #else 4238 SIMD_DO (if (mask[i]) values[i] = (*this)[i]); 4239 #endif 4240 } 4241 4242 4243 template <int scale> 4244 OIIO_FORCEINLINE void 4245 vint4::gather (const value_t *baseptr, const vint_t& vindex) 4246 { 4247 #if OIIO_SIMD_AVX >= 2 4248 m_simd = _mm_i32gather_epi32 (baseptr, vindex, scale); 
4249 #else 4250 SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale)); 4251 #endif 4252 } 4253 4254 template<int scale> 4255 OIIO_FORCEINLINE void 4256 vint4::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex) 4257 { 4258 #if OIIO_SIMD_AVX >= 2 4259 m_simd = _mm_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm_cvtps_epi32(mask), scale); 4260 #else 4261 SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0); 4262 #endif 4263 } 4264 4265 template<int scale> 4266 OIIO_FORCEINLINE void 4267 vint4::scatter (value_t *baseptr, const vint_t& vindex) const 4268 { 4269 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4270 // FIXME: disable because it benchmarks slower than the dumb way 4271 _mm_i32scatter_epi32 (baseptr, vindex, m_simd, scale); 4272 #else 4273 SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]); 4274 #endif 4275 } 4276 4277 template<int scale> 4278 OIIO_FORCEINLINE void 4279 vint4::scatter_mask (const bool_t& mask, value_t *baseptr, 4280 const vint_t& vindex) const 4281 { 4282 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4283 // FIXME: disable because it benchmarks slower than the dumb way 4284 _mm_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale); 4285 #else 4286 SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]); 4287 #endif 4288 } 4289 4290 4291 OIIO_FORCEINLINE void vint4::clear () { 4292 #if OIIO_SIMD_SSE 4293 m_simd = _mm_setzero_si128(); 4294 #else 4295 *this = 0; 4296 #endif 4297 } 4298 4299 4300 4301 OIIO_FORCEINLINE const vint4 vint4::Zero () { 4302 #if OIIO_SIMD_SSE 4303 return _mm_setzero_si128(); 4304 #else 4305 return 0; 4306 #endif 4307 } 4308 4309 4310 OIIO_FORCEINLINE const vint4 vint4::One () { return vint4(1); } 4311 4312 OIIO_FORCEINLINE const vint4 vint4::NegOne () { 4313 #if OIIO_SIMD_SSE 4314 // Fastest way to fill an __m128 with all 1 bits is to cmpeq_epi8 4315 // 
any value to itself. 4316 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000) 4317 __m128i anyval = _mm_undefined_si128(); 4318 # else 4319 __m128i anyval = _mm_setzero_si128(); 4320 # endif 4321 return _mm_cmpeq_epi8 (anyval, anyval); 4322 #else 4323 return vint4(-1); 4324 #endif 4325 } 4326 4327 4328 4329 OIIO_FORCEINLINE const vint4 vint4::Iota (int start, int step) { 4330 return vint4 (start+0*step, start+1*step, start+2*step, start+3*step); 4331 } 4332 4333 4334 OIIO_FORCEINLINE const vint4 vint4::Giota () { 4335 return vint4 (1<<0, 1<<1, 1<<2, 1<<3); 4336 } 4337 4338 4339 OIIO_FORCEINLINE vint4 operator+ (const vint4& a, const vint4& b) { 4340 #if OIIO_SIMD_SSE 4341 return _mm_add_epi32 (a.simd(), b.simd()); 4342 #else 4343 SIMD_RETURN (vint4, a[i] + b[i]); 4344 #endif 4345 } 4346 4347 OIIO_FORCEINLINE const vint4& operator+= (vint4& a, const vint4& b) { 4348 return a = a + b; 4349 } 4350 4351 4352 OIIO_FORCEINLINE vint4 operator- (const vint4& a) { 4353 #if OIIO_SIMD_SSE 4354 return _mm_sub_epi32 (_mm_setzero_si128(), a); 4355 #else 4356 SIMD_RETURN (vint4, -a[i]); 4357 #endif 4358 } 4359 4360 4361 OIIO_FORCEINLINE vint4 operator- (const vint4& a, const vint4& b) { 4362 #if OIIO_SIMD_SSE 4363 return _mm_sub_epi32 (a.simd(), b.simd()); 4364 #else 4365 SIMD_RETURN (vint4, a[i] - b[i]); 4366 #endif 4367 } 4368 4369 4370 OIIO_FORCEINLINE const vint4 &operator-= (vint4& a, const vint4& b) { 4371 return a = a - b; 4372 } 4373 4374 4375 #if OIIO_SIMD_SSE 4376 // Shamelessly lifted from Syrah which lifted from Manta which lifted it 4377 // from intel.com 4378 OIIO_FORCEINLINE __m128i mul_epi32 (__m128i a, __m128i b) { 4379 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 4380 return _mm_mullo_epi32(a, b); 4381 #else 4382 // Prior to SSE 4.1, there is no _mm_mullo_epi32 instruction, so we have 4383 // to fake it. 
4384 __m128i t0; 4385 __m128i t1; 4386 t0 = _mm_mul_epu32 (a, b); 4387 t1 = _mm_mul_epu32 (_mm_shuffle_epi32 (a, 0xB1), 4388 _mm_shuffle_epi32 (b, 0xB1)); 4389 t0 = _mm_shuffle_epi32 (t0, 0xD8); 4390 t1 = _mm_shuffle_epi32 (t1, 0xD8); 4391 return _mm_unpacklo_epi32 (t0, t1); 4392 #endif 4393 } 4394 #endif 4395 4396 4397 OIIO_FORCEINLINE vint4 operator* (const vint4& a, const vint4& b) { 4398 #if OIIO_SIMD_SSE 4399 return mul_epi32 (a.simd(), b.simd()); 4400 #else 4401 SIMD_RETURN (vint4, a[i] * b[i]); 4402 #endif 4403 } 4404 4405 4406 OIIO_FORCEINLINE const vint4& operator*= (vint4& a, const vint4& b) { return a = a * b; } 4407 OIIO_FORCEINLINE const vint4& operator*= (vint4& a, int b) { return a = a * b; } 4408 4409 4410 OIIO_FORCEINLINE vint4 operator/ (const vint4& a, const vint4& b) { 4411 // NO INTEGER DIVISION IN SSE! 4412 SIMD_RETURN (vint4, a[i] / b[i]); 4413 } 4414 4415 4416 OIIO_FORCEINLINE const vint4& operator/= (vint4& a, const vint4& b) { return a = a / b; } 4417 4418 OIIO_FORCEINLINE vint4 operator% (const vint4& a, const vint4& b) { 4419 // NO INTEGER MODULUS IN SSE! 4420 SIMD_RETURN (vint4, a[i] % b[i]); 4421 } 4422 4423 4424 4425 OIIO_FORCEINLINE const vint4& operator%= (vint4& a, const vint4& b) { return a = a % b; } 4426 4427 4428 OIIO_FORCEINLINE vint4 operator% (const vint4& a, int w) { 4429 // NO INTEGER MODULUS in SSE! 
4430 SIMD_RETURN (vint4, a[i] % w); 4431 } 4432 4433 4434 OIIO_FORCEINLINE const vint4& operator%= (vint4& a, int b) { return a = a % b; } 4435 4436 4437 OIIO_FORCEINLINE vint4 operator& (const vint4& a, const vint4& b) { 4438 #if OIIO_SIMD_SSE 4439 return _mm_and_si128 (a.simd(), b.simd()); 4440 #elif OIIO_SIMD_NEON 4441 return vandq_s32(a.simd(), b.simd()); 4442 #else 4443 SIMD_RETURN (vint4, a[i] & b[i]); 4444 #endif 4445 } 4446 4447 4448 OIIO_FORCEINLINE const vint4& operator&= (vint4& a, const vint4& b) { return a = a & b; } 4449 4450 4451 4452 OIIO_FORCEINLINE vint4 operator| (const vint4& a, const vint4& b) { 4453 #if OIIO_SIMD_SSE 4454 return _mm_or_si128 (a.simd(), b.simd()); 4455 #elif OIIO_SIMD_NEON 4456 return vorrq_s32(a.simd(), b.simd()); 4457 #else 4458 SIMD_RETURN (vint4, a[i] | b[i]); 4459 #endif 4460 } 4461 4462 OIIO_FORCEINLINE const vint4& operator|= (vint4& a, const vint4& b) { return a = a | b; } 4463 4464 4465 OIIO_FORCEINLINE vint4 operator^ (const vint4& a, const vint4& b) { 4466 #if OIIO_SIMD_SSE 4467 return _mm_xor_si128 (a.simd(), b.simd()); 4468 #elif OIIO_SIMD_NEON 4469 return veorq_s32(a.simd(), b.simd()); 4470 #else 4471 SIMD_RETURN (vint4, a[i] ^ b[i]); 4472 #endif 4473 } 4474 4475 4476 OIIO_FORCEINLINE const vint4& operator^= (vint4& a, const vint4& b) { return a = a ^ b; } 4477 4478 4479 OIIO_FORCEINLINE vint4 operator~ (const vint4& a) { 4480 #if OIIO_SIMD_SSE 4481 return a ^ a.NegOne(); 4482 #elif OIIO_SIMD_NEON 4483 return vmvnq_s32(a.m_simd); 4484 #else 4485 SIMD_RETURN (vint4, ~a[i]); 4486 #endif 4487 } 4488 4489 OIIO_FORCEINLINE vint4 operator<< (const vint4& a, unsigned int bits) { 4490 #if OIIO_SIMD_SSE 4491 return _mm_slli_epi32 (a, bits); 4492 #else 4493 SIMD_RETURN (vint4, a[i] << bits); 4494 #endif 4495 } 4496 4497 OIIO_FORCEINLINE const vint4& operator<<= (vint4& a, const unsigned int bits) { 4498 return a = a << bits; 4499 } 4500 4501 4502 OIIO_FORCEINLINE vint4 operator>> (const vint4& a, const unsigned int bits) { 
4503 #if OIIO_SIMD_SSE 4504 return _mm_srai_epi32 (a, bits); 4505 #else 4506 SIMD_RETURN (vint4, a[i] >> bits); 4507 #endif 4508 } 4509 4510 OIIO_FORCEINLINE const vint4& operator>>= (vint4& a, const unsigned int bits) { 4511 return a = a >> bits; 4512 } 4513 4514 4515 OIIO_FORCEINLINE vint4 srl (const vint4& a, const unsigned int bits) { 4516 #if OIIO_SIMD_SSE 4517 return _mm_srli_epi32 (a, bits); 4518 #else 4519 SIMD_RETURN (vint4, int ((unsigned int)(a[i]) >> bits)); 4520 #endif 4521 } 4522 4523 4524 OIIO_FORCEINLINE vbool4 operator== (const vint4& a, const vint4& b) { 4525 #if OIIO_SIMD_SSE 4526 return _mm_castsi128_ps(_mm_cmpeq_epi32 (a, b)); 4527 #elif OIIO_SIMD_NEON 4528 return vceqq_s32 (a.m_simd, b.m_simd); 4529 #else 4530 SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0); 4531 #endif 4532 } 4533 4534 OIIO_FORCEINLINE vbool4 operator!= (const vint4& a, const vint4& b) { 4535 return ! (a == b); 4536 } 4537 4538 4539 OIIO_FORCEINLINE vbool4 operator> (const vint4& a, const vint4& b) { 4540 #if OIIO_SIMD_SSE 4541 return _mm_castsi128_ps(_mm_cmpgt_epi32 (a, b)); 4542 #else 4543 SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0); 4544 #endif 4545 } 4546 4547 OIIO_FORCEINLINE vbool4 operator< (const vint4& a, const vint4& b) { 4548 #if OIIO_SIMD_SSE 4549 return _mm_castsi128_ps(_mm_cmplt_epi32 (a, b)); 4550 #else 4551 SIMD_RETURN (vbool4, a[i] < b[i] ? 
-1 : 0); 4552 #endif 4553 } 4554 4555 OIIO_FORCEINLINE vbool4 operator>= (const vint4& a, const vint4& b) { 4556 return (b < a) | (a == b); 4557 } 4558 4559 OIIO_FORCEINLINE vbool4 operator<= (const vint4& a, const vint4& b) { 4560 return (b > a) | (a == b); 4561 } 4562 4563 inline std::ostream& operator<< (std::ostream& cout, const vint4& val) { 4564 cout << val[0]; 4565 for (int i = 1; i < val.elements; ++i) 4566 cout << ' ' << val[i]; 4567 return cout; 4568 } 4569 4570 4571 OIIO_FORCEINLINE void vint4::store (int *values, int n) const { 4572 OIIO_DASSERT (n >= 0 && n <= elements); 4573 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4574 // This SHOULD be fast, but in my benchmarks, it is slower! 4575 // (At least on the AVX512 hardware I have, Xeon Silver 4110.) 4576 // Re-test this periodically with new Intel hardware. 4577 _mm_mask_storeu_epi32 (values, __mmask8(~(0xf << n)), m_simd); 4578 #elif OIIO_SIMD 4579 // For full SIMD, there is a speed advantage to storing all components. 4580 if (n == elements) 4581 store (values); 4582 else 4583 for (int i = 0; i < n; ++i) 4584 values[i] = m_val[i]; 4585 #else 4586 for (int i = 0; i < n; ++i) 4587 values[i] = m_val[i]; 4588 #endif 4589 } 4590 4591 4592 4593 OIIO_FORCEINLINE void vint4::store (unsigned short *values) const { 4594 #if OIIO_AVX512VL_ENABLED 4595 _mm_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xf), m_simd); 4596 #elif OIIO_SIMD_SSE 4597 // Expressed as half-words and considering little endianness, we 4598 // currently have AxBxCxDx (the 'x' means don't care). 
4599 vint4 clamped = m_simd & vint4(0xffff); // A0B0C0D0 4600 vint4 low = _mm_shufflelo_epi16 (clamped, (0<<0) | (2<<2) | (1<<4) | (1<<6)); 4601 // low = AB00xxxx 4602 vint4 high = _mm_shufflehi_epi16 (clamped, (1<<0) | (1<<2) | (0<<4) | (2<<6)); 4603 // high = xxxx00CD 4604 vint4 highswapped = shuffle_sse<2,3,0,1>(high); // 00CDxxxx 4605 vint4 result = low | highswapped; // ABCDxxxx 4606 _mm_storel_pd ((double *)values, _mm_castsi128_pd(result)); 4607 // At this point, values[] should hold A,B,C,D 4608 #else 4609 SIMD_DO (values[i] = m_val[i]); 4610 #endif 4611 } 4612 4613 4614 4615 OIIO_FORCEINLINE void vint4::store (unsigned char *values) const { 4616 #if OIIO_AVX512VL_ENABLED 4617 _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd); 4618 #elif OIIO_SIMD_SSE 4619 // Expressed as bytes and considering little endianness, we 4620 // currently have AxBxCxDx (the 'x' means don't care). 4621 vint4 clamped = m_simd & vint4(0xff); // A000 B000 C000 D000 4622 vint4 swapped = shuffle_sse<1,0,3,2>(clamped); // B000 A000 D000 C000 4623 vint4 shifted = swapped << 8; // 0B00 0A00 0D00 0C00 4624 vint4 merged = clamped | shifted; // AB00 xxxx CD00 xxxx 4625 vint4 merged2 = shuffle_sse<2,2,2,2>(merged); // CD00 ... 4626 vint4 shifted2 = merged2 << 16; // 00CD ... 4627 vint4 result = merged | shifted2; // ABCD ... 
4628 *(int*)values = result[0]; //extract<0>(result); 4629 // At this point, values[] should hold A,B,C,D 4630 #else 4631 SIMD_DO (values[i] = m_val[i]); 4632 #endif 4633 } 4634 4635 4636 4637 4638 template<int i0, int i1, int i2, int i3> 4639 OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { 4640 #if OIIO_SIMD_SSE 4641 return shuffle_sse<i0,i1,i2,i3> (__m128i(a)); 4642 #else 4643 return vint4(a[i0], a[i1], a[i2], a[i3]); 4644 #endif 4645 } 4646 4647 template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); } 4648 4649 4650 template<int i> 4651 OIIO_FORCEINLINE int extract (const vint4& v) { 4652 #if OIIO_SIMD_SSE >= 4 4653 return _mm_extract_epi32(v.simd(), i); // SSE4.1 only 4654 #else 4655 return v[i]; 4656 #endif 4657 } 4658 4659 #if OIIO_SIMD_SSE 4660 template<> OIIO_FORCEINLINE int extract<0> (const vint4& v) { 4661 return _mm_cvtsi128_si32(v.simd()); 4662 } 4663 #endif 4664 4665 template<int i> 4666 OIIO_FORCEINLINE vint4 insert (const vint4& a, int val) { 4667 #if OIIO_SIMD_SSE >= 4 4668 return _mm_insert_epi32 (a.simd(), val, i); 4669 #else 4670 vint4 tmp = a; 4671 tmp[i] = val; 4672 return tmp; 4673 #endif 4674 } 4675 4676 4677 4678 OIIO_FORCEINLINE int vint4::x () const { return extract<0>(*this); } 4679 OIIO_FORCEINLINE int vint4::y () const { return extract<1>(*this); } 4680 OIIO_FORCEINLINE int vint4::z () const { return extract<2>(*this); } 4681 OIIO_FORCEINLINE int vint4::w () const { return extract<3>(*this); } 4682 OIIO_FORCEINLINE void vint4::set_x (int val) { *this = insert<0>(*this, val); } 4683 OIIO_FORCEINLINE void vint4::set_y (int val) { *this = insert<1>(*this, val); } 4684 OIIO_FORCEINLINE void vint4::set_z (int val) { *this = insert<2>(*this, val); } 4685 OIIO_FORCEINLINE void vint4::set_w (int val) { *this = insert<3>(*this, val); } 4686 4687 4688 OIIO_FORCEINLINE vint4 bitcast_to_int (const vbool4& x) 4689 { 4690 #if OIIO_SIMD_SSE 4691 return _mm_castps_si128 (x.simd()); 4692 #else 4693 return 
*(vint4 *)&x; 4694 #endif 4695 } 4696 4697 // Old names: (DEPRECATED 1.8) 4698 inline vint4 bitcast_to_int4 (const vbool4& x) { return bitcast_to_int(x); } 4699 4700 4701 OIIO_FORCEINLINE vint4 vreduce_add (const vint4& v) { 4702 #if OIIO_SIMD_SSE >= 3 4703 // People seem to agree that SSE3 does add reduction best with 2 4704 // horizontal adds. 4705 // suppose v = (a, b, c, d) 4706 simd::vint4 ab_cd = _mm_hadd_epi32 (v.simd(), v.simd()); 4707 // ab_cd = (a+b, c+d, a+b, c+d) 4708 simd::vint4 abcd = _mm_hadd_epi32 (ab_cd.simd(), ab_cd.simd()); 4709 // all abcd elements are a+b+c+d, return an element as fast as possible 4710 return abcd; 4711 #elif OIIO_SIMD_SSE >= 2 4712 // I think this is the best we can do for SSE2, and I'm still not sure 4713 // it's faster than the default scalar operation. But anyway... 4714 // suppose v = (a, b, c, d) 4715 vint4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v; 4716 // ab_ab_cd_cd = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d) 4717 vint4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd); 4718 // cd_cd_ab_ab = (c+d,c+d,a+b,a+b) 4719 vint4 abcd = ab_ab_cd_cd + cd_cd_ab_ab; // a+b+c+d in all components 4720 return abcd; 4721 #else 4722 return vint4(reduce_add(v)); 4723 #endif 4724 } 4725 4726 4727 OIIO_FORCEINLINE int reduce_add (const vint4& v) { 4728 #if OIIO_SIMD_SSE 4729 return extract<0> (vreduce_add(v)); 4730 #elif OIIO_SIMD_NEON 4731 return vaddvq_s32(v); 4732 #else 4733 SIMD_RETURN_REDUCE (int, 0, r += v[i]); 4734 #endif 4735 } 4736 4737 4738 OIIO_FORCEINLINE int reduce_and (const vint4& v) { 4739 #if OIIO_SIMD_SSE 4740 vint4 ab = v & shuffle<1,1,3,3>(v); // ab bb cd dd 4741 vint4 abcd = ab & shuffle<2>(ab); 4742 return extract<0>(abcd); 4743 #else 4744 SIMD_RETURN_REDUCE (int, -1, r &= v[i]); 4745 #endif 4746 } 4747 4748 4749 OIIO_FORCEINLINE int reduce_or (const vint4& v) { 4750 #if OIIO_SIMD_SSE 4751 vint4 ab = v | shuffle<1,1,3,3>(v); // ab bb cd dd 4752 vint4 abcd = ab | shuffle<2>(ab); 4753 return extract<0>(abcd); 4754 #else 4755 
SIMD_RETURN_REDUCE (int, 0, r |= v[i]); 4756 #endif 4757 } 4758 4759 4760 4761 OIIO_FORCEINLINE vint4 blend (const vint4& a, const vint4& b, const vbool4& mask) { 4762 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 4763 return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps(a.simd()), 4764 _mm_castsi128_ps(b.simd()), mask)); 4765 #elif OIIO_SIMD_SSE 4766 return _mm_or_si128 (_mm_and_si128(_mm_castps_si128(mask.simd()), b.simd()), 4767 _mm_andnot_si128(_mm_castps_si128(mask.simd()), a.simd())); 4768 #elif OIIO_SIMD_NEON 4769 return vbslq_s32 (mask.simd(), b.simd(), a.simd()); 4770 #else 4771 SIMD_RETURN (vint4, mask[i] ? b[i] : a[i]); 4772 #endif 4773 } 4774 4775 OIIO_FORCEINLINE vint4 blend0 (const vint4& a, const vbool4& mask) { 4776 #if OIIO_SIMD_SSE 4777 return _mm_and_si128(_mm_castps_si128(mask), a.simd()); 4778 #else 4779 SIMD_RETURN (vint4, mask[i] ? a[i] : 0.0f); 4780 #endif 4781 } 4782 4783 4784 OIIO_FORCEINLINE vint4 blend0not (const vint4& a, const vbool4& mask) { 4785 #if OIIO_SIMD_SSE 4786 return _mm_andnot_si128(_mm_castps_si128(mask), a.simd()); 4787 #else 4788 SIMD_RETURN (vint4, mask[i] ? 
0.0f : a[i]); 4789 #endif 4790 } 4791 4792 4793 OIIO_FORCEINLINE vint4 select (const vbool4& mask, const vint4& a, const vint4& b) { 4794 return blend (b, a, mask); 4795 } 4796 4797 4798 4799 OIIO_FORCEINLINE vint4 abs (const vint4& a) { 4800 #if OIIO_SIMD_SSE >= 3 4801 return _mm_abs_epi32(a.simd()); 4802 #elif OIIO_SIMD_NEON 4803 return vabsq_s32(a.simd()); 4804 #else 4805 SIMD_RETURN (vint4, std::abs(a[i])); 4806 #endif 4807 } 4808 4809 4810 4811 OIIO_FORCEINLINE vint4 min (const vint4& a, const vint4& b) { 4812 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 4813 return _mm_min_epi32 (a, b); 4814 #elif OIIO_SIMD_NEON 4815 return vminq_s32(a, b); 4816 #else 4817 SIMD_RETURN (vint4, std::min(a[i], b[i])); 4818 #endif 4819 } 4820 4821 4822 OIIO_FORCEINLINE vint4 max (const vint4& a, const vint4& b) { 4823 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 4824 return _mm_max_epi32 (a, b); 4825 #elif OIIO_SIMD_NEON 4826 return vmaxq_s32(a, b); 4827 #else 4828 SIMD_RETURN (vint4, std::max(a[i], b[i])); 4829 #endif 4830 } 4831 4832 4833 OIIO_FORCEINLINE vint4 rotl(const vint4& x, int s) { 4834 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4835 // return _mm_rol_epi32 (x, s); 4836 // We want to do this ^^^ but this intrinsic only takes an *immediate* 4837 // argument for s, and there isn't a way to express in C++ that a 4838 // parameter must be an immediate/literal value from the caller. 4839 return (x<<s) | srl(x,32-s); 4840 #else 4841 return (x<<s) | srl(x,32-s); 4842 #endif 4843 } 4844 4845 // DEPRECATED (2.1) 4846 OIIO_FORCEINLINE vint4 rotl32 (const vint4& x, const unsigned int k) { 4847 return rotl(x, k); 4848 } 4849 4850 4851 OIIO_FORCEINLINE vint4 andnot (const vint4& a, const vint4& b) { 4852 #if OIIO_SIMD_SSE 4853 return _mm_andnot_si128 (a.simd(), b.simd()); 4854 #else 4855 SIMD_RETURN (vint4, ~(a[i]) & b[i]); 4856 #endif 4857 } 4858 4859 4860 // Implementation had to be after the definition of vint4::Zero. 
4861 OIIO_FORCEINLINE vbool4::vbool4 (const vint4& ival) { 4862 m_simd = (ival != vint4::Zero()); 4863 } 4864 4865 4866 4867 OIIO_FORCEINLINE vint4 safe_mod (const vint4& a, const vint4& b) { 4868 // NO INTEGER MODULUS IN SSE! 4869 SIMD_RETURN (vint4, b[i] ? a[i] % b[i] : 0); 4870 } 4871 4872 OIIO_FORCEINLINE vint4 safe_mod (const vint4& a, int b) { 4873 return b ? (a % b) : vint4::Zero(); 4874 } 4875 4876 4877 4878 4879 ////////////////////////////////////////////////////////////////////// 4880 // vint8 implementation 4881 4882 OIIO_FORCEINLINE const vint8 & vint8::operator= (const vint8& other) { 4883 m_simd = other.m_simd; 4884 return *this; 4885 } 4886 4887 OIIO_FORCEINLINE int vint8::operator[] (int i) const { 4888 OIIO_DASSERT(i<elements); 4889 return m_val[i]; 4890 } 4891 4892 OIIO_FORCEINLINE int& vint8::operator[] (int i) { 4893 OIIO_DASSERT(i<elements); 4894 return m_val[i]; 4895 } 4896 4897 OIIO_FORCEINLINE void vint8::setcomp (int i, int val) { 4898 OIIO_DASSERT(i<elements); 4899 m_val[i] = val; 4900 } 4901 4902 4903 OIIO_FORCEINLINE void vint8::load (int a) { 4904 #if OIIO_SIMD_AVX 4905 m_simd = _mm256_set1_epi32 (a); 4906 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 4907 m_4[0].load(a); 4908 m_4[1].load(a); 4909 #else 4910 SIMD_CONSTRUCT (a); 4911 #endif 4912 } 4913 4914 4915 OIIO_FORCEINLINE void vint8::load (int a, int b, int c, int d, 4916 int e, int f, int g, int h) { 4917 #if OIIO_SIMD_AVX 4918 m_simd = _mm256_set_epi32 (h, g, f, e, d, c, b, a); 4919 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 4920 m_4[0].load(a, b, c, d); 4921 m_4[1].load(e, f, g, h); 4922 #else 4923 m_val[0] = a; 4924 m_val[1] = b; 4925 m_val[2] = c; 4926 m_val[3] = d; 4927 m_val[4] = e; 4928 m_val[5] = f; 4929 m_val[6] = g; 4930 m_val[7] = h; 4931 #endif 4932 } 4933 4934 4935 OIIO_FORCEINLINE void vint8::load (const int *values) { 4936 #if OIIO_SIMD_AVX 4937 m_simd = _mm256_loadu_si256 ((const simd_t *)values); 4938 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 4939 m_4[0].load(values); 4940 
m_4[1].load(values+4); 4941 #else 4942 SIMD_CONSTRUCT (values[i]); 4943 #endif 4944 } 4945 4946 4947 OIIO_FORCEINLINE void vint8::load (const int *values, int n) 4948 { 4949 OIIO_DASSERT (n >= 0 && n <= elements); 4950 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 4951 m_simd = _mm256_maskz_loadu_epi32 ((~(0xff << n)), values); 4952 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 4953 if (n > 4) { 4954 vint4 lo, hi; 4955 lo.load (values); 4956 hi.load (values+4, n-4); 4957 m_4[0] = lo; 4958 m_4[1] = hi; 4959 } else { 4960 vint4 lo, hi; 4961 lo.load (values, n); 4962 hi.clear(); 4963 m_4[0] = lo; 4964 m_4[1] = hi; 4965 } 4966 #else 4967 for (int i = 0; i < n; ++i) 4968 m_val[i] = values[i]; 4969 for (int i = n; i < elements; ++i) 4970 m_val[i] = 0; 4971 #endif 4972 } 4973 4974 4975 OIIO_FORCEINLINE void vint8::load (const short *values) { 4976 #if OIIO_SIMD_AVX >= 2 4977 m_simd = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)values)); 4978 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 4979 m_4[0].load(values); 4980 m_4[1].load(values+4); 4981 #else 4982 SIMD_CONSTRUCT (values[i]); 4983 #endif 4984 } 4985 4986 OIIO_FORCEINLINE void vint8::load (const unsigned short *values) { 4987 #if OIIO_SIMD_AVX >= 2 4988 m_simd = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)values)); 4989 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 4990 m_4[0].load(values); 4991 m_4[1].load(values+4); 4992 #else 4993 SIMD_CONSTRUCT (values[i]); 4994 #endif 4995 } 4996 4997 4998 OIIO_FORCEINLINE void vint8::load (const char *values) { 4999 #if OIIO_SIMD_AVX >= 2 5000 __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values)); 5001 m_simd = _mm256_cvtepi8_epi32 (bytes); 5002 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 5003 m_4[0].load(values); 5004 m_4[1].load(values+4); 5005 #else 5006 SIMD_CONSTRUCT (values[i]); 5007 #endif 5008 } 5009 5010 OIIO_FORCEINLINE void vint8::load (const unsigned char *values) { 5011 #if OIIO_SIMD_AVX >= 2 5012 __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double 
*)values)); 5013 m_simd = _mm256_cvtepu8_epi32 (bytes); 5014 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 5015 m_4[0].load(values); 5016 m_4[1].load(values+4); 5017 #else 5018 SIMD_CONSTRUCT (values[i]); 5019 #endif 5020 } 5021 5022 5023 5024 OIIO_FORCEINLINE vint8::vint8 (int a) { load(a); } 5025 5026 OIIO_FORCEINLINE vint8::vint8 (int a, int b, int c, int d, 5027 int e, int f, int g, int h) { 5028 load(a,b,c,d,e,f,g,h); 5029 } 5030 5031 OIIO_FORCEINLINE vint8::vint8 (const int *vals) { load (vals); } 5032 OIIO_FORCEINLINE vint8::vint8 (const unsigned short *vals) { load(vals); } 5033 OIIO_FORCEINLINE vint8::vint8 (const short *vals) { load(vals); } 5034 OIIO_FORCEINLINE vint8::vint8 (const unsigned char *vals) { load(vals); } 5035 OIIO_FORCEINLINE vint8::vint8 (const char *vals) { load(vals); } 5036 5037 OIIO_FORCEINLINE const vint8 & vint8::operator= (int a) { load(a); return *this; } 5038 5039 5040 OIIO_FORCEINLINE void vint8::store (int *values) const { 5041 #if OIIO_SIMD_AVX 5042 // Use an unaligned store -- it's just as fast when the memory turns 5043 // out to be aligned, nearly as fast even when unaligned. Not worth 5044 // the headache of using stores that require alignment. 5045 _mm256_storeu_si256 ((simd_t *)values, m_simd); 5046 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 5047 m_4[0].store(values); 5048 m_4[1].store(values+4); 5049 #else 5050 SIMD_DO (values[i] = m_val[i]); 5051 #endif 5052 } 5053 5054 5055 OIIO_FORCEINLINE void vint8::load_mask (int mask, const int *values) { 5056 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5057 m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values); 5058 #elif OIIO_SIMD_AVX >= 2 5059 m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask))); 5060 #else 5061 SIMD_CONSTRUCT ((mask>>i) & 1 ? 
values[i] : 0); 5062 #endif 5063 } 5064 5065 5066 OIIO_FORCEINLINE void vint8::load_mask (const vbool8& mask, const int *values) { 5067 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5068 m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values); 5069 #elif OIIO_SIMD_AVX >= 2 5070 m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(mask)); 5071 #else 5072 SIMD_CONSTRUCT (mask[i] ? values[i] : 0); 5073 #endif 5074 } 5075 5076 5077 OIIO_FORCEINLINE void vint8::store_mask (int mask, int *values) const { 5078 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5079 _mm256_mask_storeu_epi32 (values, __mmask8(mask), m_simd); 5080 #elif OIIO_SIMD_AVX >= 2 5081 _mm256_maskstore_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd); 5082 #else 5083 SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]); 5084 #endif 5085 } 5086 5087 5088 OIIO_FORCEINLINE void vint8::store_mask (const vbool8& mask, int *values) const { 5089 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5090 _mm256_mask_storeu_epi32 (values, __mmask8(mask.bitmask()), m_simd); 5091 #elif OIIO_SIMD_AVX >= 2 5092 _mm256_maskstore_epi32 (values, _mm256_castps_si256(mask), m_simd); 5093 #else 5094 SIMD_DO (if (mask[i]) values[i] = (*this)[i]); 5095 #endif 5096 } 5097 5098 5099 template <int scale> 5100 OIIO_FORCEINLINE void 5101 vint8::gather (const value_t *baseptr, const vint_t& vindex) 5102 { 5103 #if OIIO_SIMD_AVX >= 2 5104 m_simd = _mm256_i32gather_epi32 (baseptr, vindex, scale); 5105 #else 5106 SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale)); 5107 #endif 5108 } 5109 5110 template<int scale> 5111 OIIO_FORCEINLINE void 5112 vint8::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex) 5113 { 5114 #if OIIO_SIMD_AVX >= 2 5115 m_simd = _mm256_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm256_cvtps_epi32(mask), scale); 5116 #else 5117 SIMD_CONSTRUCT (mask[i] ? 
*(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0); 5118 #endif 5119 } 5120 5121 template<int scale> 5122 OIIO_FORCEINLINE void 5123 vint8::scatter (value_t *baseptr, const vint_t& vindex) const 5124 { 5125 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5126 _mm256_i32scatter_epi32 (baseptr, vindex, m_simd, scale); 5127 #else 5128 SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]); 5129 #endif 5130 } 5131 5132 template<int scale> 5133 OIIO_FORCEINLINE void 5134 vint8::scatter_mask (const bool_t& mask, value_t *baseptr, 5135 const vint_t& vindex) const 5136 { 5137 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5138 _mm256_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale); 5139 #else 5140 SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]); 5141 #endif 5142 } 5143 5144 5145 OIIO_FORCEINLINE void vint8::clear () { 5146 #if OIIO_SIMD_AVX 5147 m_simd = _mm256_setzero_si256(); 5148 #else 5149 *this = 0; 5150 #endif 5151 } 5152 5153 5154 OIIO_FORCEINLINE const vint8 vint8::Zero () { 5155 #if OIIO_SIMD_AVX 5156 return _mm256_setzero_si256(); 5157 #else 5158 return 0; 5159 #endif 5160 } 5161 5162 OIIO_FORCEINLINE const vint8 vint8::One () { return vint8(1); } 5163 5164 OIIO_FORCEINLINE const vint8 vint8::NegOne () { return vint8(-1); } 5165 5166 5167 OIIO_FORCEINLINE const vint8 vint8::Iota (int start, int step) { 5168 return vint8 (start+0*step, start+1*step, start+2*step, start+3*step, 5169 start+4*step, start+5*step, start+6*step, start+7*step); 5170 } 5171 5172 5173 OIIO_FORCEINLINE const vint8 vint8::Giota () { 5174 return vint8 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7); 5175 } 5176 5177 5178 OIIO_FORCEINLINE vint4 vint8::lo () const { 5179 #if OIIO_SIMD_AVX 5180 return _mm256_castsi256_si128 (simd()); 5181 #else 5182 return m_4[0]; 5183 #endif 5184 } 5185 5186 OIIO_FORCEINLINE vint4 vint8::hi () const { 5187 #if OIIO_SIMD_AVX 5188 return _mm256_extractf128_si256 (simd(), 
1); 5189 #else 5190 return m_4[1]; 5191 #endif 5192 } 5193 5194 5195 OIIO_FORCEINLINE vint8::vint8 (const vint4& lo, const vint4 &hi) { 5196 #if OIIO_SIMD_AVX 5197 __m256i r = _mm256_castsi128_si256 (lo); 5198 m_simd = _mm256_insertf128_si256 (r, hi, 1); 5199 // N.B. equivalent, if available: m_simd = _mm256_set_m128i (hi, lo); 5200 // FIXME: when would this not be available? 5201 #else 5202 m_4[0] = lo; 5203 m_4[1] = hi; 5204 #endif 5205 } 5206 5207 5208 OIIO_FORCEINLINE vint8 operator+ (const vint8& a, const vint8& b) { 5209 #if OIIO_SIMD_AVX >= 2 5210 return _mm256_add_epi32 (a.simd(), b.simd()); 5211 #else 5212 SIMD_RETURN (vint8, a[i] + b[i]); 5213 #endif 5214 } 5215 5216 5217 OIIO_FORCEINLINE const vint8& operator+= (vint8& a, const vint8& b) { 5218 return a = a + b; 5219 } 5220 5221 5222 OIIO_FORCEINLINE vint8 operator- (const vint8& a) { 5223 #if OIIO_SIMD_AVX >= 2 5224 return _mm256_sub_epi32 (_mm256_setzero_si256(), a); 5225 #else 5226 SIMD_RETURN (vint8, -a[i]); 5227 #endif 5228 } 5229 5230 5231 OIIO_FORCEINLINE vint8 operator- (const vint8& a, const vint8& b) { 5232 #if OIIO_SIMD_AVX >= 2 5233 return _mm256_sub_epi32 (a.simd(), b.simd()); 5234 #else 5235 SIMD_RETURN (vint8, a[i] - b[i]); 5236 #endif 5237 } 5238 5239 5240 OIIO_FORCEINLINE const vint8 &operator-= (vint8& a, const vint8& b) { 5241 return a = a - b; 5242 } 5243 5244 5245 OIIO_FORCEINLINE vint8 operator* (const vint8& a, const vint8& b) { 5246 #if OIIO_SIMD_AVX >= 2 5247 return _mm256_mullo_epi32 (a.simd(), b.simd()); 5248 #else 5249 SIMD_RETURN (vint8, a[i] * b[i]); 5250 #endif 5251 } 5252 5253 5254 OIIO_FORCEINLINE const vint8& operator*= (vint8& a, const vint8& b) { return a = a * b; } 5255 OIIO_FORCEINLINE const vint8& operator*= (vint8& a, int b) { return a = a * b; } 5256 5257 5258 OIIO_FORCEINLINE vint8 operator/ (const vint8& a, const vint8& b) { 5259 // NO INTEGER DIVISION IN SSE or AVX! 
5260 SIMD_RETURN (vint8, a[i] / b[i]); 5261 } 5262 5263 OIIO_FORCEINLINE const vint8& operator/= (vint8& a, const vint8& b) { return a = a / b; } 5264 5265 5266 OIIO_FORCEINLINE vint8 operator% (const vint8& a, const vint8& b) { 5267 // NO INTEGER MODULUS IN SSE or AVX! 5268 SIMD_RETURN (vint8, a[i] % b[i]); 5269 } 5270 5271 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, const vint8& b) { return a = a % b; } 5272 5273 OIIO_FORCEINLINE vint8 operator% (const vint8& a, int w) { 5274 // NO INTEGER MODULUS in SSE or AVX! 5275 SIMD_RETURN (vint8, a[i] % w); 5276 } 5277 5278 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, int b) { return a = a % b; } 5279 5280 5281 OIIO_FORCEINLINE vint8 operator& (const vint8& a, const vint8& b) { 5282 #if OIIO_SIMD_AVX >= 2 5283 return _mm256_and_si256 (a.simd(), b.simd()); 5284 #else 5285 SIMD_RETURN (vint8, a[i] & b[i]); 5286 #endif 5287 } 5288 5289 OIIO_FORCEINLINE const vint8& operator&= (vint8& a, const vint8& b) { return a = a & b; } 5290 5291 OIIO_FORCEINLINE vint8 operator| (const vint8& a, const vint8& b) { 5292 #if OIIO_SIMD_AVX >= 2 5293 return _mm256_or_si256 (a.simd(), b.simd()); 5294 #else 5295 SIMD_RETURN (vint8, a[i] | b[i]); 5296 #endif 5297 } 5298 5299 OIIO_FORCEINLINE const vint8& operator|= (vint8& a, const vint8& b) { return a = a | b; } 5300 5301 OIIO_FORCEINLINE vint8 operator^ (const vint8& a, const vint8& b) { 5302 #if OIIO_SIMD_AVX >= 2 5303 return _mm256_xor_si256 (a.simd(), b.simd()); 5304 #else 5305 SIMD_RETURN (vint8, a[i] ^ b[i]); 5306 #endif 5307 } 5308 5309 OIIO_FORCEINLINE const vint8& operator^= (vint8& a, const vint8& b) { return a = a ^ b; } 5310 5311 5312 OIIO_FORCEINLINE vint8 operator~ (const vint8& a) { 5313 #if OIIO_SIMD_AVX >= 2 5314 return a ^ a.NegOne(); 5315 #else 5316 SIMD_RETURN (vint8, ~a[i]); 5317 #endif 5318 } 5319 5320 5321 OIIO_FORCEINLINE vint8 operator<< (const vint8& a, unsigned int bits) { 5322 #if OIIO_SIMD_AVX >= 2 5323 return _mm256_slli_epi32 (a, bits); 5324 #elif 
OIIO_SIMD_SSE 5325 return vint8 (a.lo() << bits, a.hi() << bits); 5326 #else 5327 SIMD_RETURN (vint8, a[i] << bits); 5328 #endif 5329 } 5330 5331 5332 OIIO_FORCEINLINE const vint8& operator<<= (vint8& a, const unsigned int bits) { 5333 return a = a << bits; 5334 } 5335 5336 OIIO_FORCEINLINE vint8 operator>> (const vint8& a, const unsigned int bits) { 5337 #if OIIO_SIMD_AVX >= 2 5338 return _mm256_srai_epi32 (a, bits); 5339 #elif OIIO_SIMD_SSE 5340 return vint8 (a.lo() >> bits, a.hi() >> bits); 5341 #else 5342 SIMD_RETURN (vint8, a[i] >> bits); 5343 #endif 5344 } 5345 5346 OIIO_FORCEINLINE const vint8& operator>>= (vint8& a, const unsigned int bits) { 5347 return a = a >> bits; 5348 } 5349 5350 5351 OIIO_FORCEINLINE vint8 srl (const vint8& a, const unsigned int bits) { 5352 #if OIIO_SIMD_AVX >= 2 5353 return _mm256_srli_epi32 (a, bits); 5354 #else 5355 SIMD_RETURN (vint8, int ((unsigned int)(a[i]) >> bits)); 5356 #endif 5357 } 5358 5359 5360 OIIO_FORCEINLINE vbool8 operator== (const vint8& a, const vint8& b) { 5361 // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ? 5362 #if OIIO_SIMD_AVX >= 2 5363 return _mm256_castsi256_ps(_mm256_cmpeq_epi32 (a.m_simd, b.m_simd)); 5364 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */ 5365 return vbool8 (a.lo() == b.lo(), a.hi() == b.hi()); 5366 #else 5367 SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0); 5368 #endif 5369 } 5370 5371 5372 OIIO_FORCEINLINE vbool8 operator!= (const vint8& a, const vint8& b) { 5373 // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ? 5374 return ! (a == b); 5375 } 5376 5377 5378 OIIO_FORCEINLINE vbool8 operator> (const vint8& a, const vint8& b) { 5379 // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ? 5380 #if OIIO_SIMD_AVX >= 2 5381 return _mm256_castsi256_ps(_mm256_cmpgt_epi32 (a, b)); 5382 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */ 5383 return vbool8 (a.lo() > b.lo(), a.hi() > b.hi()); 5384 #else 5385 SIMD_RETURN (vbool8, a[i] > b[i] ? 
-1 : 0); 5386 #endif 5387 } 5388 5389 5390 OIIO_FORCEINLINE vbool8 operator< (const vint8& a, const vint8& b) { 5391 // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ? 5392 #if OIIO_SIMD_AVX >= 2 5393 // No lt or lte! 5394 return (b > a); 5395 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */ 5396 return vbool8 (a.lo() < b.lo(), a.hi() < b.hi()); 5397 #else 5398 SIMD_RETURN (vbool8, a[i] < b[i] ? -1 : 0); 5399 #endif 5400 } 5401 5402 5403 OIIO_FORCEINLINE vbool8 operator>= (const vint8& a, const vint8& b) { 5404 // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ? 5405 return (a > b) | (a == b); 5406 } 5407 5408 5409 OIIO_FORCEINLINE vbool8 operator<= (const vint8& a, const vint8& b) { 5410 // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ? 5411 return (b > a) | (a == b); 5412 } 5413 5414 5415 inline std::ostream& operator<< (std::ostream& cout, const vint8& val) { 5416 cout << val[0]; 5417 for (int i = 1; i < val.elements; ++i) 5418 cout << ' ' << val[i]; 5419 return cout; 5420 } 5421 5422 5423 OIIO_FORCEINLINE void vint8::store (int *values, int n) const { 5424 OIIO_DASSERT (n >= 0 && n <= elements); 5425 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5426 // This SHOULD be fast, but in my benchmarks, it is slower! 5427 // (At least on the AVX512 hardware I have, Xeon Silver 4110.) 5428 // Re-test this periodically with new Intel hardware. 
5429 _mm256_mask_storeu_epi32 (values, __mmask8(~(0xff << n)), m_simd); 5430 #elif OIIO_SIMD_SSE 5431 if (n <= 4) { 5432 lo().store (values, n); 5433 } else if (n < 8) { 5434 lo().store (values); 5435 hi().store (values+4, n-4); 5436 } else { 5437 store (values); 5438 } 5439 #else 5440 for (int i = 0; i < n; ++i) 5441 values[i] = m_val[i]; 5442 #endif 5443 } 5444 5445 5446 // FIXME(AVX): fast vint8 store to unsigned short, unsigned char 5447 5448 OIIO_FORCEINLINE void vint8::store (unsigned short *values) const { 5449 #if OIIO_AVX512VL_ENABLED 5450 _mm256_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xff), m_simd); 5451 #elif OIIO_SIMD_SSE 5452 lo().store (values); 5453 hi().store (values+4); 5454 #else 5455 SIMD_DO (values[i] = m_val[i]); 5456 #endif 5457 } 5458 5459 5460 OIIO_FORCEINLINE void vint8::store (unsigned char *values) const { 5461 #if OIIO_AVX512VL_ENABLED 5462 _mm256_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xff), m_simd); 5463 #elif OIIO_SIMD_SSE 5464 lo().store (values); 5465 hi().store (values+4); 5466 #else 5467 SIMD_DO (values[i] = m_val[i]); 5468 #endif 5469 } 5470 5471 5472 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 5473 OIIO_FORCEINLINE vint8 shuffle (const vint8& a) { 5474 #if OIIO_SIMD_AVX >= 2 5475 vint8 index (i0, i1, i2, i3, i4, i5, i6, i7); 5476 return _mm256_castps_si256 (_mm256_permutevar8x32_ps (_mm256_castsi256_ps(a.simd()), index.simd())); 5477 #else 5478 return vint8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]); 5479 #endif 5480 } 5481 5482 template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) { 5483 return shuffle<i,i,i,i,i,i,i,i>(a); 5484 } 5485 5486 5487 template<int i> 5488 OIIO_FORCEINLINE int extract (const vint8& v) { 5489 #if OIIO_SIMD_AVX && !_WIN32 5490 return _mm256_extract_epi32(v.simd(), i); 5491 #else 5492 return v[i]; 5493 #endif 5494 } 5495 5496 5497 template<int i> 5498 OIIO_FORCEINLINE vint8 insert (const vint8& a, int val) { 5499 #if OIIO_SIMD_AVX && !_WIN32 
5500 return _mm256_insert_epi32 (a.simd(), val, i); 5501 #else 5502 vint8 tmp = a; 5503 tmp[i] = val; 5504 return tmp; 5505 #endif 5506 } 5507 5508 5509 OIIO_FORCEINLINE int vint8::x () const { return extract<0>(*this); } 5510 OIIO_FORCEINLINE int vint8::y () const { return extract<1>(*this); } 5511 OIIO_FORCEINLINE int vint8::z () const { return extract<2>(*this); } 5512 OIIO_FORCEINLINE int vint8::w () const { return extract<3>(*this); } 5513 OIIO_FORCEINLINE void vint8::set_x (int val) { *this = insert<0>(*this, val); } 5514 OIIO_FORCEINLINE void vint8::set_y (int val) { *this = insert<1>(*this, val); } 5515 OIIO_FORCEINLINE void vint8::set_z (int val) { *this = insert<2>(*this, val); } 5516 OIIO_FORCEINLINE void vint8::set_w (int val) { *this = insert<3>(*this, val); } 5517 5518 5519 OIIO_FORCEINLINE vint8 bitcast_to_int (const vbool8& x) 5520 { 5521 #if OIIO_SIMD_AVX 5522 return _mm256_castps_si256 (x.simd()); 5523 #else 5524 return *(vint8 *)&x; 5525 #endif 5526 } 5527 5528 5529 OIIO_FORCEINLINE vint8 vreduce_add (const vint8& v) { 5530 #if OIIO_SIMD_AVX >= 2 5531 // From Syrah: 5532 vint8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_epi32(v.simd(), _mm256_setzero_si256()); 5533 vint8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_epi32(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_si256()); 5534 // get efgh in the 0-idx slot 5535 vint8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0); 5536 vint8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh; 5537 return shuffle<0>(final_sum); 5538 #elif OIIO_SIMD_SSE 5539 vint4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi()); 5540 return vint8(hadd4, hadd4); 5541 #else 5542 return vint8(reduce_add(v)); 5543 #endif 5544 } 5545 5546 5547 OIIO_FORCEINLINE int reduce_add (const vint8& v) { 5548 #if OIIO_SIMD_SSE 5549 return extract<0> (vreduce_add(v)); 5550 #else 5551 return reduce_add(v.lo()) + reduce_add(v.hi()); 5552 #endif 5553 } 5554 5555 5556 OIIO_FORCEINLINE int reduce_and (const vint8& v) { 5557 #if OIIO_SSE_AVX >= 2 5558 vint8 ab = v & 
shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh 5559 vint8 abcd = ab & shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x 5560 vint8 abcdefgh = abcd & shuffle<4>(abcdefgh); // abcdefgh x x x x x x x 5561 return extract<0> (abcdefgh); 5562 #else 5563 // AVX 1.0 or less -- use SSE 5564 return reduce_and(v.lo() & v.hi()); 5565 #endif 5566 } 5567 5568 5569 OIIO_FORCEINLINE int reduce_or (const vint8& v) { 5570 #if OIIO_SSE_AVX >= 2 5571 vint8 ab = v | shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh 5572 vint8 abcd = ab | shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x 5573 vint8 abcdefgh = abcd | shuffle<4>(abcdefgh); // abcdefgh x x x x x x x 5574 return extract<0> (abcdefgh); 5575 #else 5576 // AVX 1.0 or less -- use SSE 5577 return reduce_or(v.lo() | v.hi()); 5578 #endif 5579 } 5580 5581 5582 OIIO_FORCEINLINE vint8 blend (const vint8& a, const vint8& b, const vbool8& mask) { 5583 #if OIIO_SIMD_AVX 5584 return _mm256_castps_si256 (_mm256_blendv_ps (_mm256_castsi256_ps(a.simd()), 5585 _mm256_castsi256_ps(b.simd()), mask)); 5586 #elif OIIO_SIMD_SSE 5587 return vint8 (blend(a.lo(), b.lo(), mask.lo()), 5588 blend(a.hi(), b.hi(), mask.hi())); 5589 #else 5590 SIMD_RETURN (vint8, mask[i] ? b[i] : a[i]); 5591 #endif 5592 } 5593 5594 5595 OIIO_FORCEINLINE vint8 blend0 (const vint8& a, const vbool8& mask) { 5596 // FIXME: More efficient for AVX-512 to use 5597 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(maxk),a))? 5598 #if OIIO_SIMD_AVX 5599 return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.simd()), mask)); 5600 #elif OIIO_SIMD_SSE 5601 return vint8 (blend0(a.lo(), mask.lo()), 5602 blend0(a.hi(), mask.hi())); 5603 #else 5604 SIMD_RETURN (vint8, mask[i] ? a[i] : 0.0f); 5605 #endif 5606 } 5607 5608 5609 OIIO_FORCEINLINE vint8 blend0not (const vint8& a, const vbool8& mask) { 5610 // FIXME: More efficient for AVX-512 to use 5611 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(!maxk),a))? 
5612 #if OIIO_SIMD_AVX 5613 return _mm256_castps_si256 (_mm256_andnot_ps (mask.simd(), _mm256_castsi256_ps(a.simd()))); 5614 #elif OIIO_SIMD_SSE 5615 return vint8 (blend0not(a.lo(), mask.lo()), 5616 blend0not(a.hi(), mask.hi())); 5617 #else 5618 SIMD_RETURN (vint8, mask[i] ? 0.0f : a[i]); 5619 #endif 5620 } 5621 5622 OIIO_FORCEINLINE vint8 select (const vbool8& mask, const vint8& a, const vint8& b) { 5623 return blend (b, a, mask); 5624 } 5625 5626 5627 OIIO_FORCEINLINE vint8 abs (const vint8& a) { 5628 #if OIIO_SIMD_AVX >= 2 5629 return _mm256_abs_epi32(a.simd()); 5630 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 5631 return vint8(abs(a.lo()), abs(a.hi())); 5632 #else 5633 SIMD_RETURN (vint8, std::abs(a[i])); 5634 #endif 5635 } 5636 5637 5638 OIIO_FORCEINLINE vint8 min (const vint8& a, const vint8& b) { 5639 #if OIIO_SIMD_AVX >= 2 5640 return _mm256_min_epi32 (a, b); 5641 #else 5642 return vint8 (min(a.lo(), b.lo()), min(a.hi(), b.hi())); 5643 #endif 5644 } 5645 5646 5647 OIIO_FORCEINLINE vint8 max (const vint8& a, const vint8& b) { 5648 #if OIIO_SIMD_AVX >= 2 5649 return _mm256_max_epi32 (a, b); 5650 #else 5651 return vint8 (max(a.lo(), b.lo()), max(a.hi(), b.hi())); 5652 #endif 5653 } 5654 5655 5656 OIIO_FORCEINLINE vint8 rotl(const vint8& x, int s) { 5657 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 5658 // return _mm256_rol_epi32 (x, s); 5659 // We want to do this ^^^ but this intrinsic only takes an *immediate* 5660 // argument for s, and there isn't a way to express in C++ that a 5661 // parameter must be an immediate/literal value from the caller. 
    return (x<<s) | srl(x,32-s);
#else
    return (x<<s) | srl(x,32-s);
#endif
}

// DEPRECATED (2.1)
OIIO_FORCEINLINE vint8 rotl32 (const vint8& x, const unsigned int k) {
    return rotl(x, k);
}


// Per-lane bitwise (~a) & b.  Note the operand order matches the
// _mm*_andnot_* intrinsics: the FIRST argument is complemented.
OIIO_FORCEINLINE vint8 andnot (const vint8& a, const vint8& b) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_andnot_si256 (a.simd(), b.simd());
#elif OIIO_SIMD_AVX >= 1
    // AVX 1.0 has no 256-bit integer ops; do it through the float domain.
    return _mm256_castps_si256 (_mm256_andnot_ps (_mm256_castsi256_ps(a.simd()), _mm256_castsi256_ps(b.simd())));
#else
    SIMD_RETURN (vint8, ~(a[i]) & b[i]);
#endif
}


// Implementation had to be after the definition of vint8::Zero.
// A lane is "true" iff the corresponding int lane is nonzero.
OIIO_FORCEINLINE vbool8::vbool8 (const vint8& ival) {
    m_simd = (ival != vint8::Zero());
}



// Per-lane modulus that yields 0 (rather than undefined behavior) for
// lanes where the divisor is 0.  Always scalarized:
OIIO_FORCEINLINE vint8 safe_mod (const vint8& a, const vint8& b) {
    // NO INTEGER MODULUS IN SSE!
    SIMD_RETURN (vint8, b[i] ? a[i] % b[i] : 0);
}

// Scalar-divisor variant: the zero check hoists out of the lanes.
OIIO_FORCEINLINE vint8 safe_mod (const vint8& a, int b) {
    return b ?
(a % b) : vint8::Zero(); 5699 } 5700 5701 5702 5703 5704 ////////////////////////////////////////////////////////////////////// 5705 // vint16 implementation 5706 5707 OIIO_FORCEINLINE const vint16 & vint16::operator= (const vint16& other) { 5708 m_simd = other.m_simd; 5709 return *this; 5710 } 5711 5712 OIIO_FORCEINLINE int vint16::operator[] (int i) const { 5713 OIIO_DASSERT(i<elements); 5714 return m_val[i]; 5715 } 5716 5717 OIIO_FORCEINLINE int& vint16::operator[] (int i) { 5718 OIIO_DASSERT(i<elements); 5719 return m_val[i]; 5720 } 5721 5722 OIIO_FORCEINLINE void vint16::setcomp (int i, int val) { 5723 OIIO_DASSERT(i<elements); 5724 m_val[i] = val; 5725 } 5726 5727 5728 OIIO_FORCEINLINE void vint16::load (int a) { 5729 #if OIIO_SIMD_AVX >= 512 5730 m_simd = _mm512_set1_epi32 (a); 5731 #else 5732 m_8[0].load (a); 5733 m_8[1].load (a); 5734 #endif 5735 } 5736 5737 5738 OIIO_FORCEINLINE void vint16::load (int v0, int v1, int v2, int v3, 5739 int v4, int v5, int v6, int v7, 5740 int v8, int v9, int v10, int v11, 5741 int v12, int v13, int v14, int v15) { 5742 #if OIIO_SIMD_AVX >= 512 5743 m_simd = _mm512_setr_epi32 (v0, v1, v2, v3, v4, v5, v6, v7, 5744 v8, v9, v10, v11, v12, v13, v14, v15); 5745 #else 5746 m_val[ 0] = v0; 5747 m_val[ 1] = v1; 5748 m_val[ 2] = v2; 5749 m_val[ 3] = v3; 5750 m_val[ 4] = v4; 5751 m_val[ 5] = v5; 5752 m_val[ 6] = v6; 5753 m_val[ 7] = v7; 5754 m_val[ 8] = v8; 5755 m_val[ 9] = v9; 5756 m_val[10] = v10; 5757 m_val[11] = v11; 5758 m_val[12] = v12; 5759 m_val[13] = v13; 5760 m_val[14] = v14; 5761 m_val[15] = v15; 5762 #endif 5763 } 5764 5765 5766 OIIO_FORCEINLINE void vint16::load (const int *values) { 5767 #if OIIO_SIMD_AVX >= 512 5768 m_simd = _mm512_loadu_si512 ((const simd_t *)values); 5769 #else 5770 m_8[0].load (values); 5771 m_8[1].load (values+8); 5772 #endif 5773 } 5774 5775 5776 OIIO_FORCEINLINE void vint16::load (const int *values, int n) 5777 { 5778 #if OIIO_SIMD_AVX >= 512 5779 m_simd = _mm512_maskz_loadu_epi32 
(__mmask16(~(0xffff << n)), values); 5780 #else 5781 if (n > 8) { 5782 m_8[0].load (values); 5783 m_8[1].load (values+8, n-8); 5784 } else { 5785 m_8[0].load (values, n); 5786 m_8[1].clear (); 5787 } 5788 #endif 5789 } 5790 5791 5792 OIIO_FORCEINLINE void vint16::load (const short *values) { 5793 #if OIIO_SIMD_AVX >= 512 5794 m_simd = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)values)); 5795 #else 5796 m_8[0].load (values); 5797 m_8[1].load (values+8); 5798 #endif 5799 } 5800 5801 OIIO_FORCEINLINE void vint16::load (const unsigned short *values) { 5802 #if OIIO_SIMD_AVX >= 512 5803 m_simd = _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)values)); 5804 #else 5805 m_8[0].load (values); 5806 m_8[1].load (values+8); 5807 #endif 5808 } 5809 5810 5811 OIIO_FORCEINLINE void vint16::load (const char *values) { 5812 #if OIIO_SIMD_AVX >= 512 5813 m_simd = _mm512_cvtepi8_epi32(_mm_loadu_si128((__m128i*)values)); 5814 #else 5815 m_8[0].load (values); 5816 m_8[1].load (values+8); 5817 #endif 5818 } 5819 5820 OIIO_FORCEINLINE void vint16::load (const unsigned char *values) { 5821 #if OIIO_SIMD_AVX >= 512 5822 m_simd = _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)values)); 5823 #else 5824 m_8[0].load (values); 5825 m_8[1].load (values+8); 5826 #endif 5827 } 5828 5829 5830 OIIO_FORCEINLINE vint16::vint16 (int a) { load(a); } 5831 5832 OIIO_FORCEINLINE vint16::vint16 (int v0, int v1, int v2, int v3, 5833 int v4, int v5, int v6, int v7, 5834 int v8, int v9, int v10, int v11, 5835 int v12, int v13, int v14, int v15) { 5836 load (v0, v1, v2, v3, v4, v5, v6, v7, 5837 v8, v9, v10, v11, v12, v13, v14, v15); 5838 } 5839 5840 OIIO_FORCEINLINE vint16::vint16 (const int *vals) { load (vals); } 5841 OIIO_FORCEINLINE vint16::vint16 (const unsigned short *vals) { load(vals); } 5842 OIIO_FORCEINLINE vint16::vint16 (const short *vals) { load(vals); } 5843 OIIO_FORCEINLINE vint16::vint16 (const unsigned char *vals) { load(vals); } 5844 OIIO_FORCEINLINE vint16::vint16 (const char 
*vals) { load(vals); } 5845 5846 OIIO_FORCEINLINE const vint16 & vint16::operator= (int a) { load(a); return *this; } 5847 5848 5849 OIIO_FORCEINLINE void vint16::load_mask (const vbool16 &mask, const int *values) { 5850 #if OIIO_SIMD_AVX >= 512 5851 m_simd = _mm512_maskz_loadu_epi32 (mask, (const simd_t *)values); 5852 #else 5853 m_8[0].load_mask (mask.lo(), values); 5854 m_8[1].load_mask (mask.hi(), values+8); 5855 #endif 5856 } 5857 5858 5859 OIIO_FORCEINLINE void vint16::store_mask (const vbool16 &mask, int *values) const { 5860 #if OIIO_SIMD_AVX >= 512 5861 _mm512_mask_storeu_epi32 (values, mask.bitmask(), m_simd); 5862 #else 5863 lo().store_mask (mask.lo(), values); 5864 hi().store_mask (mask.hi(), values+8); 5865 #endif 5866 } 5867 5868 5869 template <int scale> 5870 OIIO_FORCEINLINE void 5871 vint16::gather (const value_t *baseptr, const vint_t& vindex) { 5872 #if OIIO_SIMD_AVX >= 512 5873 m_simd = _mm512_i32gather_epi32 (vindex, baseptr, scale); 5874 #else 5875 m_8[0].gather<scale> (baseptr, vindex.lo()); 5876 m_8[1].gather<scale> (baseptr, vindex.hi()); 5877 #endif 5878 } 5879 5880 template<int scale> 5881 OIIO_FORCEINLINE void 5882 vint16::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex) { 5883 #if OIIO_SIMD_AVX >= 512 5884 m_simd = _mm512_mask_i32gather_epi32 (m_simd, mask, vindex, baseptr, scale); 5885 #else 5886 m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo()); 5887 m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi()); 5888 #endif 5889 } 5890 5891 template<int scale> 5892 OIIO_FORCEINLINE void 5893 vint16::scatter (value_t *baseptr, const vint_t& vindex) const { 5894 #if OIIO_SIMD_AVX >= 512 5895 _mm512_i32scatter_epi32 (baseptr, vindex, m_simd, scale); 5896 #else 5897 lo().scatter<scale> (baseptr, vindex.lo()); 5898 hi().scatter<scale> (baseptr, vindex.hi()); 5899 #endif 5900 } 5901 5902 template<int scale> 5903 OIIO_FORCEINLINE void 5904 vint16::scatter_mask (const bool_t& mask, value_t *baseptr, 5905 
const vint_t& vindex) const { 5906 #if OIIO_SIMD_AVX >= 512 5907 _mm512_mask_i32scatter_epi32 (baseptr, mask, vindex, m_simd, scale); 5908 #else 5909 lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo()); 5910 hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi()); 5911 #endif 5912 } 5913 5914 5915 OIIO_FORCEINLINE void vint16::store (int *values) const { 5916 #if OIIO_SIMD_AVX >= 512 5917 // Use an unaligned store -- it's just as fast when the memory turns 5918 // out to be aligned, nearly as fast even when unaligned. Not worth 5919 // the headache of using stores that require alignment. 5920 _mm512_storeu_si512 ((simd_t *)values, m_simd); 5921 #else 5922 lo().store (values); 5923 hi().store (values+8); 5924 #endif 5925 } 5926 5927 5928 OIIO_FORCEINLINE void vint16::clear () { 5929 #if OIIO_SIMD_AVX >= 512 5930 m_simd = _mm512_setzero_si512(); 5931 #else 5932 *this = 0; 5933 #endif 5934 } 5935 5936 5937 OIIO_FORCEINLINE const vint16 vint16::Zero () { 5938 #if OIIO_SIMD_AVX >= 512 5939 return _mm512_setzero_epi32(); 5940 #else 5941 return 0; 5942 #endif 5943 } 5944 5945 OIIO_FORCEINLINE const vint16 vint16::One () { return vint16(1); } 5946 5947 OIIO_FORCEINLINE const vint16 vint16::NegOne () { return vint16(-1); } 5948 5949 5950 OIIO_FORCEINLINE const vint16 vint16::Iota (int start, int step) { 5951 return vint16 (start+0*step, start+1*step, start+2*step, start+3*step, 5952 start+4*step, start+5*step, start+6*step, start+7*step, 5953 start+8*step, start+9*step, start+10*step, start+11*step, 5954 start+12*step, start+13*step, start+14*step, start+15*step); 5955 } 5956 5957 5958 OIIO_FORCEINLINE const vint16 vint16::Giota () { 5959 return vint16 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 5960 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15); 5961 } 5962 5963 5964 OIIO_FORCEINLINE vint8 vint16::lo () const { 5965 #if OIIO_SIMD_AVX >= 512 5966 return _mm512_castsi512_si256 (simd()); 5967 #else 5968 return m_8[0]; 5969 #endif 5970 } 5971 5972 
OIIO_FORCEINLINE vint8 vint16::hi () const { 5973 #if OIIO_SIMD_AVX >= 512 5974 return _mm512_extracti64x4_epi64 (simd(), 1); 5975 #else 5976 return m_8[1]; 5977 #endif 5978 } 5979 5980 5981 OIIO_FORCEINLINE vint16::vint16 (const vint8& lo, const vint8 &hi) { 5982 #if OIIO_SIMD_AVX >= 512 5983 __m512i r = _mm512_castsi256_si512 (lo); 5984 m_simd = _mm512_inserti32x8 (r, hi, 1); 5985 #else 5986 m_8[0] = lo; 5987 m_8[1] = hi; 5988 #endif 5989 } 5990 5991 5992 OIIO_FORCEINLINE vint16::vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d) { 5993 #if OIIO_SIMD_AVX >= 512 5994 m_simd = _mm512_broadcast_i32x4(a); 5995 m_simd = _mm512_inserti32x4 (m_simd, b, 1); 5996 m_simd = _mm512_inserti32x4 (m_simd, c, 2); 5997 m_simd = _mm512_inserti32x4 (m_simd, d, 3); 5998 #else 5999 m_8[0] = vint8(a,b); 6000 m_8[1] = vint8(c,d); 6001 #endif 6002 } 6003 6004 6005 OIIO_FORCEINLINE vint16 operator+ (const vint16& a, const vint16& b) { 6006 #if OIIO_SIMD_AVX >= 512 6007 return _mm512_add_epi32 (a.simd(), b.simd()); 6008 #else 6009 return vint16 (a.lo()+b.lo(), a.hi()+b.hi()); 6010 #endif 6011 } 6012 6013 6014 OIIO_FORCEINLINE const vint16& operator+= (vint16& a, const vint16& b) { 6015 return a = a + b; 6016 } 6017 6018 6019 OIIO_FORCEINLINE vint16 operator- (const vint16& a) { 6020 #if OIIO_SIMD_AVX >= 512 6021 return _mm512_sub_epi32 (_mm512_setzero_si512(), a); 6022 #else 6023 return vint16 (-a.lo(), -a.hi()); 6024 #endif 6025 } 6026 6027 6028 OIIO_FORCEINLINE vint16 operator- (const vint16& a, const vint16& b) { 6029 #if OIIO_SIMD_AVX >= 512 6030 return _mm512_sub_epi32 (a.simd(), b.simd()); 6031 #else 6032 return vint16 (a.lo()-b.lo(), a.hi()-b.hi()); 6033 #endif 6034 } 6035 6036 6037 OIIO_FORCEINLINE const vint16 &operator-= (vint16& a, const vint16& b) { 6038 return a = a - b; 6039 } 6040 6041 6042 OIIO_FORCEINLINE vint16 operator* (const vint16& a, const vint16& b) { 6043 #if OIIO_SIMD_AVX >= 512 6044 return _mm512_mullo_epi32 (a.simd(), b.simd()); 6045 #else 
6046 return vint16 (a.lo()*b.lo(), a.hi()*b.hi()); 6047 #endif 6048 } 6049 6050 6051 OIIO_FORCEINLINE const vint16& operator*= (vint16& a, const vint16& b) { return a = a * b; } 6052 OIIO_FORCEINLINE const vint16& operator*= (vint16& a, int b) { return a = a * b; } 6053 6054 6055 OIIO_FORCEINLINE vint16 operator/ (const vint16& a, const vint16& b) { 6056 // NO INTEGER DIVISION IN AVX512! 6057 SIMD_RETURN (vint16, a[i] / b[i]); 6058 } 6059 6060 OIIO_FORCEINLINE const vint16& operator/= (vint16& a, const vint16& b) { return a = a / b; } 6061 6062 6063 OIIO_FORCEINLINE vint16 operator% (const vint16& a, const vint16& b) { 6064 // NO INTEGER MODULUS IN AVX512! 6065 SIMD_RETURN (vint16, a[i] % b[i]); 6066 } 6067 6068 OIIO_FORCEINLINE const vint16& operator%= (vint16& a, const vint16& b) { return a = a % b; } 6069 6070 OIIO_FORCEINLINE vint16 operator% (const vint16& a, int w) { 6071 // NO INTEGER MODULUS in AVX512! 6072 SIMD_RETURN (vint16, a[i] % w); 6073 } 6074 6075 OIIO_FORCEINLINE const vint16& operator%= (vint16& a, int b) { return a = a % b; } 6076 6077 6078 OIIO_FORCEINLINE vint16 operator& (const vint16& a, const vint16& b) { 6079 #if OIIO_SIMD_AVX >= 512 6080 return _mm512_and_si512 (a.simd(), b.simd()); 6081 #else 6082 return vint16 (a.lo() & b.lo(), a.hi() & b.hi()); 6083 #endif 6084 } 6085 6086 OIIO_FORCEINLINE const vint16& operator&= (vint16& a, const vint16& b) { return a = a & b; } 6087 6088 OIIO_FORCEINLINE vint16 operator| (const vint16& a, const vint16& b) { 6089 #if OIIO_SIMD_AVX >= 512 6090 return _mm512_or_si512 (a.simd(), b.simd()); 6091 #else 6092 return vint16 (a.lo() | b.lo(), a.hi() | b.hi()); 6093 #endif 6094 } 6095 6096 OIIO_FORCEINLINE const vint16& operator|= (vint16& a, const vint16& b) { return a = a | b; } 6097 6098 OIIO_FORCEINLINE vint16 operator^ (const vint16& a, const vint16& b) { 6099 #if OIIO_SIMD_AVX >= 512 6100 return _mm512_xor_si512 (a.simd(), b.simd()); 6101 #else 6102 return vint16 (a.lo() ^ b.lo(), a.hi() ^ b.hi()); 6103 
#endif 6104 } 6105 6106 OIIO_FORCEINLINE const vint16& operator^= (vint16& a, const vint16& b) { return a = a ^ b; } 6107 6108 6109 OIIO_FORCEINLINE vint16 operator~ (const vint16& a) { 6110 #if OIIO_SIMD_AVX >= 512 6111 return a ^ a.NegOne(); 6112 #else 6113 return vint16 (~a.lo(), ~a.hi()); 6114 #endif 6115 } 6116 6117 6118 OIIO_FORCEINLINE vint16 operator<< (const vint16& a, const unsigned int bits) { 6119 #if OIIO_SIMD_AVX >= 512 6120 return _mm512_sllv_epi32 (a, vint16(int(bits))); 6121 // return _mm512_slli_epi32 (a, bits); 6122 // FIXME: can this be slli? 6123 #else 6124 return vint16 (a.lo() << bits, a.hi() << bits); 6125 #endif 6126 } 6127 6128 6129 OIIO_FORCEINLINE const vint16& operator<<= (vint16& a, const unsigned int bits) { 6130 return a = a << bits; 6131 } 6132 6133 OIIO_FORCEINLINE vint16 operator>> (const vint16& a, const unsigned int bits) { 6134 #if OIIO_SIMD_AVX >= 512 6135 return _mm512_srav_epi32 (a, vint16(int(bits))); 6136 // FIXME: can this be srai? 6137 #else 6138 return vint16 (a.lo() >> bits, a.hi() >> bits); 6139 #endif 6140 } 6141 6142 OIIO_FORCEINLINE const vint16& operator>>= (vint16& a, const unsigned int bits) { 6143 return a = a >> bits; 6144 } 6145 6146 6147 OIIO_FORCEINLINE vint16 srl (const vint16& a, const unsigned int bits) { 6148 #if OIIO_SIMD_AVX >= 512 6149 return _mm512_srlv_epi32 (a, vint16(int(bits))); 6150 // FIXME: can this be srli? 
6151 #else 6152 return vint16 (srl(a.lo(), bits), srl (a.hi(), bits)); 6153 #endif 6154 } 6155 6156 6157 OIIO_FORCEINLINE vbool16 operator== (const vint16& a, const vint16& b) { 6158 #if OIIO_SIMD_AVX >= 512 6159 return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 0 /*_MM_CMPINT_EQ*/); 6160 #else /* Fall back to 8-wide */ 6161 return vbool16 (a.lo() == b.lo(), a.hi() == b.hi()); 6162 #endif 6163 } 6164 6165 6166 OIIO_FORCEINLINE vbool16 operator!= (const vint16& a, const vint16& b) { 6167 #if OIIO_SIMD_AVX >= 512 6168 return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 4 /*_MM_CMPINT_NEQ*/); 6169 #else /* Fall back to 8-wide */ 6170 return vbool16 (a.lo() != b.lo(), a.hi() != b.hi()); 6171 #endif 6172 } 6173 6174 6175 OIIO_FORCEINLINE vbool16 operator> (const vint16& a, const vint16& b) { 6176 #if OIIO_SIMD_AVX >= 512 6177 return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 6 /*_MM_CMPINT_NLE*/); 6178 #else /* Fall back to 8-wide */ 6179 return vbool16 (a.lo() > b.lo(), a.hi() > b.hi()); 6180 #endif 6181 } 6182 6183 6184 OIIO_FORCEINLINE vbool16 operator< (const vint16& a, const vint16& b) { 6185 #if OIIO_SIMD_AVX >= 512 6186 return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 1 /*_MM_CMPINT_LT*/); 6187 #else /* Fall back to 8-wide */ 6188 return vbool16 (a.lo() < b.lo(), a.hi() < b.hi()); 6189 #endif 6190 } 6191 6192 6193 OIIO_FORCEINLINE vbool16 operator>= (const vint16& a, const vint16& b) { 6194 #if OIIO_SIMD_AVX >= 512 6195 return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 5 /*_MM_CMPINT_NLT*/); 6196 #else /* Fall back to 8-wide */ 6197 return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi()); 6198 #endif 6199 } 6200 6201 6202 OIIO_FORCEINLINE vbool16 operator<= (const vint16& a, const vint16& b) { 6203 #if OIIO_SIMD_AVX >= 512 6204 return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 2 /*_MM_CMPINT_LE*/); 6205 #else /* Fall back to 8-wide */ 6206 return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi()); 6207 #endif 6208 } 6209 6210 6211 inline std::ostream& operator<< 
(std::ostream& cout, const vint16& val) { 6212 cout << val[0]; 6213 for (int i = 1; i < val.elements; ++i) 6214 cout << ' ' << val[i]; 6215 return cout; 6216 } 6217 6218 6219 6220 OIIO_FORCEINLINE void vint16::store (int *values, int n) const { 6221 OIIO_DASSERT (n >= 0 && n <= elements); 6222 #if 0 && OIIO_SIMD_AVX >= 512 6223 // This SHOULD be fast, but in my benchmarks, it is slower! 6224 // (At least on the AVX512 hardware I have, Xeon Silver 4110.) 6225 // Re-test this periodically with new Intel hardware. 6226 _mm512_mask_storeu_epi32 (values, __mmask16(~(0xffff << n)), m_simd); 6227 #else 6228 if (n > 8) { 6229 m_8[0].store (values); 6230 m_8[1].store (values+8, n-8); 6231 } else { 6232 m_8[0].store (values, n); 6233 } 6234 #endif 6235 } 6236 6237 6238 OIIO_FORCEINLINE void vint16::store (unsigned short *values) const { 6239 #if OIIO_SIMD_AVX512 6240 _mm512_mask_cvtepi32_storeu_epi16 (values, __mmask16(0xff), m_simd); 6241 #elif OIIO_SIMD_AVX >= 2 6242 lo().store (values); 6243 hi().store (values+8); 6244 #else 6245 SIMD_DO (values[i] = m_val[i]); 6246 #endif 6247 } 6248 6249 6250 OIIO_FORCEINLINE void vint16::store (unsigned char *values) const { 6251 #if OIIO_SIMD_AVX512 6252 _mm512_mask_cvtepi32_storeu_epi8 (values, __mmask16(0xff), m_simd); 6253 #elif OIIO_SIMD_AVX >= 2 6254 lo().store (values); 6255 hi().store (values+8); 6256 #else 6257 SIMD_DO (values[i] = m_val[i]); 6258 #endif 6259 } 6260 6261 6262 6263 // Shuffle groups of 4 6264 template<int i0, int i1, int i2, int i3> 6265 vint16 shuffle4 (const vint16& a) { 6266 #if OIIO_SIMD_AVX >= 512 6267 __m512 x = _mm512_castsi512_ps(a); 6268 return _mm512_castps_si512(_mm512_shuffle_f32x4(x,x,_MM_SHUFFLE(i3,i2,i1,i0))); 6269 #else 6270 vint4 x[4]; 6271 a.store ((int *)x); 6272 return vint16 (x[i0], x[i1], x[i2], x[i3]); 6273 #endif 6274 } 6275 6276 template<int i> vint16 shuffle4 (const vint16& a) { 6277 return shuffle4<i,i,i,i> (a); 6278 } 6279 6280 template<int i0, int i1, int i2, int i3> 6281 vint16 
shuffle (const vint16& a) { 6282 #if OIIO_SIMD_AVX >= 512 6283 __m512 x = _mm512_castsi512_ps(a); 6284 return _mm512_castps_si512(_mm512_permute_ps(x,_MM_SHUFFLE(i3,i2,i1,i0))); 6285 #else 6286 vint4 x[4]; 6287 a.store ((int *)x); 6288 return vint16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]), 6289 shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3])); 6290 #endif 6291 } 6292 6293 template<int i> vint16 shuffle (const vint16& a) { 6294 return shuffle<i,i,i,i> (a); 6295 } 6296 6297 6298 template<int i> 6299 OIIO_FORCEINLINE int extract (const vint16& a) { 6300 return a[i]; 6301 } 6302 6303 6304 template<int i> 6305 OIIO_FORCEINLINE vint16 insert (const vint16& a, int val) { 6306 vint16 tmp = a; 6307 tmp[i] = val; 6308 return tmp; 6309 } 6310 6311 6312 OIIO_FORCEINLINE int vint16::x () const { 6313 #if OIIO_SIMD_AVX >= 512 6314 return _mm_cvtsi128_si32(_mm512_castsi512_si128(m_simd)); 6315 #else 6316 return m_val[0]; 6317 #endif 6318 } 6319 6320 OIIO_FORCEINLINE int vint16::y () const { return m_val[1]; } 6321 OIIO_FORCEINLINE int vint16::z () const { return m_val[2]; } 6322 OIIO_FORCEINLINE int vint16::w () const { return m_val[3]; } 6323 OIIO_FORCEINLINE void vint16::set_x (int val) { m_val[0] = val; } 6324 OIIO_FORCEINLINE void vint16::set_y (int val) { m_val[1] = val; } 6325 OIIO_FORCEINLINE void vint16::set_z (int val) { m_val[2] = val; } 6326 OIIO_FORCEINLINE void vint16::set_w (int val) { m_val[3] = val; } 6327 6328 6329 OIIO_FORCEINLINE vint16 bitcast_to_int (const vbool16& x) 6330 { 6331 #if OIIO_SIMD_AVX >= 512 6332 return _mm512_maskz_set1_epi32 (x, -1); 6333 #else 6334 return vint16 (bitcast_to_int(x.lo()), bitcast_to_int(x.hi())); 6335 #endif 6336 } 6337 6338 6339 OIIO_FORCEINLINE vint16 vreduce_add (const vint16& v) { 6340 #if OIIO_SIMD_AVX >= 512 6341 // Nomenclature: ABCD are the vint4's comprising v 6342 // First, add the vint4's and make them all the same 6343 vint16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v); // each adjacent vint4 is 
summed 6344 vint16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD); // ABCD in all quads 6345 // Now, add within each vint4 6346 vint16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w); // each adjacent int is summed 6347 return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd); 6348 #else 6349 vint8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi()); 6350 return vint16 (sum, sum); 6351 #endif 6352 } 6353 6354 6355 OIIO_FORCEINLINE int reduce_add (const vint16& v) { 6356 #if OIIO_SIMD_AVX >= 512 6357 return vreduce_add(v).x(); 6358 #else 6359 return reduce_add(v.lo()) + reduce_add(v.hi()); 6360 #endif 6361 } 6362 6363 6364 OIIO_FORCEINLINE int reduce_and (const vint16& v) { 6365 #if OIIO_SIMD_AVX >= 512 6366 // Nomenclature: ABCD are the vint4's comprising v 6367 // First, and the vint4's and make them all the same 6368 vint16 AB_AB_CD_CD = v & shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed 6369 vint16 w = AB_AB_CD_CD & shuffle4<2,3,0,1>(AB_AB_CD_CD); 6370 // Now, and within each vint4 6371 vint16 ab_ab_cd_cd = w & shuffle<1,0,3,2>(w); // each adjacent int is summed 6372 vint16 r = ab_ab_cd_cd & shuffle<2,3,0,1>(ab_ab_cd_cd); 6373 return r.x(); 6374 #else 6375 return reduce_and(v.lo()) & reduce_and(v.hi()); 6376 #endif 6377 } 6378 6379 6380 OIIO_FORCEINLINE int reduce_or (const vint16& v) { 6381 #if OIIO_SIMD_AVX >= 512 6382 // Nomenclature: ABCD are the vint4's comprising v 6383 // First, or the vint4's or make them all the same 6384 vint16 AB_AB_CD_CD = v | shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed 6385 vint16 w = AB_AB_CD_CD | shuffle4<2,3,0,1>(AB_AB_CD_CD); 6386 // Now, or within each vint4 6387 vint16 ab_ab_cd_cd = w | shuffle<1,0,3,2>(w); // each adjacent int is summed 6388 vint16 r = ab_ab_cd_cd | shuffle<2,3,0,1>(ab_ab_cd_cd); 6389 return r.x(); 6390 #else 6391 return reduce_or(v.lo()) | reduce_or(v.hi()); 6392 #endif 6393 } 6394 6395 6396 6397 OIIO_FORCEINLINE vint16 blend (const vint16& a, const vint16& b, const vbool16& mask) { 6398 #if OIIO_SIMD_AVX 
>= 512 6399 return _mm512_mask_blend_epi32 (mask, a, b); 6400 #else 6401 return vint16 (blend (a.lo(), b.lo(), mask.lo()), 6402 blend (a.hi(), b.hi(), mask.hi())); 6403 #endif 6404 } 6405 6406 6407 OIIO_FORCEINLINE vint16 blend0 (const vint16& a, const vbool16& mask) { 6408 #if OIIO_SIMD_AVX >= 512 6409 return _mm512_maskz_mov_epi32 (mask, a); 6410 #else 6411 return vint16 (blend0 (a.lo(), mask.lo()), 6412 blend0 (a.hi(), mask.hi())); 6413 #endif 6414 } 6415 6416 6417 OIIO_FORCEINLINE vint16 blend0not (const vint16& a, const vbool16& mask) { 6418 #if OIIO_SIMD_AVX >= 512 6419 return _mm512_maskz_mov_epi32 (!mask, a); 6420 #else 6421 return vint16 (blend0not (a.lo(), mask.lo()), 6422 blend0not (a.hi(), mask.hi())); 6423 #endif 6424 } 6425 6426 OIIO_FORCEINLINE vint16 select (const vbool16& mask, const vint16& a, const vint16& b) { 6427 return blend (b, a, mask); 6428 } 6429 6430 6431 OIIO_FORCEINLINE vint16 abs (const vint16& a) { 6432 #if OIIO_SIMD_AVX >= 512 6433 return _mm512_abs_epi32(a.simd()); 6434 #else 6435 return vint16 (abs(a.lo()), abs(a.hi())); 6436 #endif 6437 } 6438 6439 6440 OIIO_FORCEINLINE vint16 min (const vint16& a, const vint16& b) { 6441 #if OIIO_SIMD_AVX >= 512 6442 return _mm512_min_epi32 (a, b); 6443 #else 6444 return vint16 (min(a.lo(), b.lo()), min(a.hi(), b.hi())); 6445 #endif 6446 } 6447 6448 6449 OIIO_FORCEINLINE vint16 max (const vint16& a, const vint16& b) { 6450 #if OIIO_SIMD_AVX >= 512 6451 return _mm512_max_epi32 (a, b); 6452 #else 6453 return vint16 (max(a.lo(), b.lo()), max(a.hi(), b.hi())); 6454 #endif 6455 } 6456 6457 6458 OIIO_FORCEINLINE vint16 rotl(const vint16& x, int s) { 6459 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 6460 // return _mm512_rol_epi32 (x, s); 6461 // We want to do this ^^^ but this intrinsic only takes an *immediate* 6462 // argument for s, and there isn't a way to express in C++ that a 6463 // parameter must be an immediate/literal value from the caller. 
6464 return (x<<s) | srl(x,32-s); 6465 #else 6466 return (x<<s) | srl(x,32-s); 6467 #endif 6468 } 6469 6470 // DEPRECATED (2.1) 6471 OIIO_FORCEINLINE vint16 rotl32 (const vint16& x, const unsigned int k) { 6472 return rotl(x, k); 6473 } 6474 6475 6476 OIIO_FORCEINLINE vint16 andnot (const vint16& a, const vint16& b) { 6477 #if OIIO_SIMD_AVX >= 512 6478 return _mm512_andnot_epi32 (a.simd(), b.simd()); 6479 #else 6480 return vint16 (andnot(a.lo(), b.lo()), andnot(a.hi(), b.hi())); 6481 #endif 6482 } 6483 6484 6485 6486 OIIO_FORCEINLINE vint16 safe_mod (const vint16& a, const vint16& b) { 6487 // NO INTEGER MODULUS IN SSE! 6488 SIMD_RETURN (vint16, b[i] ? a[i] % b[i] : 0); 6489 } 6490 6491 OIIO_FORCEINLINE vint16 safe_mod (const vint16& a, int b) { 6492 return b ? (a % b) : vint16::Zero(); 6493 } 6494 6495 6496 6497 6498 6499 ////////////////////////////////////////////////////////////////////// 6500 // vfloat4 implementation 6501 6502 6503 OIIO_FORCEINLINE vfloat4::vfloat4 (const vint4& ival) { 6504 #if OIIO_SIMD_SSE 6505 m_simd = _mm_cvtepi32_ps (ival.simd()); 6506 #elif OIIO_SIMD_NEON 6507 m_simd = vcvtq_f32_s32(ival.simd()); 6508 #else 6509 SIMD_CONSTRUCT (float(ival[i])); 6510 #endif 6511 } 6512 6513 6514 OIIO_FORCEINLINE const vfloat4 vfloat4::Zero () { 6515 #if OIIO_SIMD_SSE 6516 return _mm_setzero_ps(); 6517 #else 6518 return vfloat4(0.0f); 6519 #endif 6520 } 6521 6522 OIIO_FORCEINLINE const vfloat4 vfloat4::One () { 6523 return vfloat4(1.0f); 6524 } 6525 6526 OIIO_FORCEINLINE const vfloat4 vfloat4::Iota (float start, float step) { 6527 return vfloat4 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step); 6528 } 6529 6530 /// Set all components to 0.0 6531 OIIO_FORCEINLINE void vfloat4::clear () { 6532 #if OIIO_SIMD_SSE 6533 m_simd = _mm_setzero_ps(); 6534 #else 6535 load (0.0f); 6536 #endif 6537 } 6538 6539 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator= (const Imath::V4f &v) { 6540 load ((const float *)&v); 6541 return *this; 6542 } 6543 
/// Assign from an Imath::V3f: copies x,y,z and pads the 4th lane with 0.
OIIO_FORCEINLINE const vfloat4 & vfloat4::operator= (const Imath::V3f &v) {
    load (v[0], v[1], v[2], 0.0f);
    return *this;
}

/// Component access (mutable reference). Only debug-checked bounds.
OIIO_FORCEINLINE float& vfloat4::operator[] (int i) {
    OIIO_DASSERT(i<elements);
    return m_val[i];
}

/// Component access (by value). Only debug-checked bounds.
OIIO_FORCEINLINE float vfloat4::operator[] (int i) const {
    OIIO_DASSERT(i<elements);
    return m_val[i];
}


/// Load with all 4 lanes set to the same scalar value.
OIIO_FORCEINLINE void vfloat4::load (float val) {
#if OIIO_SIMD_SSE
    m_simd = _mm_set1_ps (val);
#elif OIIO_SIMD_NEON
    m_simd = vdupq_n_f32 (val);
#else
    SIMD_CONSTRUCT (val);
#endif
}

/// Load the 4 lanes from individual scalar values.
OIIO_FORCEINLINE void vfloat4::load (float a, float b, float c, float d) {
#if OIIO_SIMD_SSE
    // N.B. _mm_set_ps takes its arguments in high-to-low lane order.
    m_simd = _mm_set_ps (d, c, b, a);
#elif OIIO_SIMD_NEON
    float values[4] = { a, b, c, d };
    m_simd = vld1q_f32 (values);
#else
    m_val[0] = a;
    m_val[1] = b;
    m_val[2] = c;
    m_val[3] = d;
#endif
}

/// Load from an array of 4 values
OIIO_FORCEINLINE void vfloat4::load (const float *values) {
#if OIIO_SIMD_SSE
    m_simd = _mm_loadu_ps (values);
#elif OIIO_SIMD_NEON
    m_simd = vld1q_f32 (values);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}


/// Partial load: read only the first n values, zero the remaining lanes.
OIIO_FORCEINLINE void vfloat4::load (const float *values, int n) {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // ~(0xf << n) has the low n bits set -- the per-lane load mask.
    m_simd = _mm_maskz_loadu_ps (__mmask8(~(0xf << n)), values);
#elif OIIO_SIMD_SSE
    switch (n) {
    case 1:
        m_simd = _mm_load_ss (values);
        break;
    case 2:
        // Trickery: load one double worth of bits!
        m_simd = _mm_castpd_ps (_mm_load_sd ((const double*)values));
        break;
    case 3:
        m_simd = _mm_setr_ps (values[0], values[1], values[2], 0.0f);
        // This looks wasteful, but benchmarks show that it's the
        // fastest way to set 3 values with the 4th getting zero.
        // Actually, gcc and clang both turn it into something more
        // efficient than _mm_setr_ps. The version below looks smart,
        // but was much more expensive than the _mm_setr_ps!
        //   __m128 xy = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)values));
        //   m_simd = _mm_movelh_ps(xy, _mm_load_ss (values + 2));
        break;
    case 4:
        m_simd = _mm_loadu_ps (values);
        break;
    default:
        clear();
        break;
    }
#elif OIIO_SIMD_NEON
    switch (n) {
    case 1: m_simd = vdupq_n_f32(0); m_simd[0] = values[0]; break;
    case 2: load (values[0], values[1], 0.0f, 0.0f); break;
    case 3: load (values[0], values[1], values[2], 0.0f); break;
    case 4: m_simd = vld1q_f32 (values); break;
    default: break;
    }
#else
    for (int i = 0; i < n; ++i)
        m_val[i] = values[i];
    for (int i = n; i < paddedelements; ++i)
        m_val[i] = 0;
#endif
}


/// Load from 4 unsigned 16 bit values, converting to float.
OIIO_FORCEINLINE void vfloat4::load (const unsigned short *values) {
#if OIIO_SIMD_SSE >= 2
    // Widen to int via the vint4 constructor, then convert to float.
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
    // You might guess that the following is faster, but it's NOT:
    //   NO!  m_simd = _mm_cvtpu16_ps (*(__m64*)values);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}


/// Load from 4 signed 16 bit values, converting to float.
OIIO_FORCEINLINE void vfloat4::load (const short *values) {
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}


/// Load from 4 unsigned 8 bit values, converting to float.
OIIO_FORCEINLINE void vfloat4::load (const unsigned char *values) {
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}

// Load from an array of 4 char values, convert to float
OIIO_FORCEINLINE void vfloat4::load (const char *values) {
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Load from 4 half (16 bit float) values, converting to float.
OIIO_FORCEINLINE void vfloat4::load (const half *values) {
#if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
    /* Enabled 16 bit float instructions! */
    __m128i a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
    m_simd = _mm_cvtph_ps (a);
#elif OIIO_SIMD_SSE >= 2
    // SSE half-to-float by Fabian "ryg" Giesen. Public domain.
    // https://gist.github.com/rygorous/2144712
    vint4 h ((const unsigned short *)values);
# define CONSTI(name) *(const __m128i *)&name
# define CONSTF(name) *(const __m128 *)&name
    OIIO_SIMD_UINT4_CONST(mask_nosign, 0x7fff);
    OIIO_SIMD_UINT4_CONST(magic,       (254 - 15) << 23);
    OIIO_SIMD_UINT4_CONST(was_infnan,  0x7bff);
    OIIO_SIMD_UINT4_CONST(exp_infnan,  255 << 23);
    __m128i mnosign     = CONSTI(mask_nosign);
    __m128i expmant     = _mm_and_si128(mnosign, h);
    __m128i justsign    = _mm_xor_si128(h, expmant);
    __m128i expmant2    = expmant; // copy (just here for counting purposes)
    __m128i shifted     = _mm_slli_epi32(expmant, 13);
    __m128  scaled      = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
    __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONSTI(was_infnan));
    __m128i sign        = _mm_slli_epi32(justsign, 16);
    __m128  infnanexp   = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
    __m128  sign_inf    = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
    __m128  final       = _mm_or_ps(scaled, sign_inf);
    // ~11 SSE2 ops.
    m_simd = final;
# undef CONSTI
# undef CONSTF
#else /* No SIMD defined: */
    SIMD_CONSTRUCT (values[i]);
#endif
}
#endif /* _HALF_H_ or _IMATH_H_ */

/// Store all 4 values to memory (need not be aligned).
OIIO_FORCEINLINE void vfloat4::store (float *values) const {
#if OIIO_SIMD_SSE
    // Use an unaligned store -- it's just as fast when the memory turns
    // out to be aligned, nearly as fast even when unaligned. Not worth
    // the headache of using stores that require alignment.
    _mm_storeu_ps (values, m_simd);
#elif OIIO_SIMD_NEON
    vst1q_f32 (values, m_simd);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}

/// Partial store: write only the first n values to memory.
OIIO_FORCEINLINE void vfloat4::store (float *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= 4);
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm_mask_storeu_ps (values, __mmask8(~(0xf << n)), m_simd);
#elif OIIO_SIMD_SSE
    switch (n) {
    case 1:
        _mm_store_ss (values, m_simd);
        break;
    case 2:
        // Trickery: store two floats as a double worth of bits
        _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
        break;
    case 3:
        values[0] = m_val[0];
        values[1] = m_val[1];
        values[2] = m_val[2];
        // This looks wasteful, but benchmarks show that it's the
        // fastest way to store 3 values, in benchmarks was faster than
        // this, below:
        //   _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
        //   _mm_store_ss (values + 2, _mm_movehl_ps(m_simd,m_simd));
        break;
    case 4:
        store (values);
        break;
    default:
        break;
    }
#elif OIIO_SIMD_NEON
    switch (n) {
    case 1:
        vst1q_lane_f32 (values, m_simd, 0);
        break;
    case 2:
        vst1q_lane_f32 (values++, m_simd, 0);
        vst1q_lane_f32 (values, m_simd, 1);
        break;
    case 3:
        vst1q_lane_f32 (values++, m_simd, 0);
        vst1q_lane_f32 (values++, m_simd, 1);
        vst1q_lane_f32 (values, m_simd, 2);
        break;
    case 4:
        vst1q_f32 (values, m_simd); break;
    default:
        break;
    }
#else
    for (int i = 0; i < n; ++i)
        values[i] = m_val[i];
#endif
}

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Store all 4 values to memory as half (16 bit float).
OIIO_FORCEINLINE void vfloat4::store (half *values) const {
#if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
    __m128i h = _mm_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
    _mm_store_sd ((double *)values, _mm_castsi128_pd(h));
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
#endif


/// Masked load: lane i gets values[i] where bit i of mask is set, else 0.
OIIO_FORCEINLINE void vfloat4::load_mask (int mask, const float *values) {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    m_simd = _mm_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
#elif OIIO_SIMD_AVX
    m_simd = _mm_maskload_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
#else
    SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
#endif
}


/// Masked load: lane i gets values[i] where mask[i] is true, else 0.
OIIO_FORCEINLINE void vfloat4::load_mask (const vbool_t& mask, const float *values) {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    m_simd = _mm_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
#elif OIIO_SIMD_AVX
    m_simd = _mm_maskload_ps (values, _mm_castps_si128(mask));
#else
    SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
#endif
}


/// Masked store: write lane i to values[i] only where bit i of mask is set.
OIIO_FORCEINLINE void vfloat4::store_mask (int mask, float *values) const {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    _mm_mask_storeu_ps (values, __mmask8(mask), m_simd);
#elif OIIO_SIMD_AVX
    _mm_maskstore_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
#else
    SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
#endif
}


/// Masked store: write lane i to values[i] only where mask[i] is true.
OIIO_FORCEINLINE void vfloat4::store_mask (const vbool_t& mask, float *values) const {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    _mm_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
#elif OIIO_SIMD_AVX
    _mm_maskstore_ps (values, _mm_castps_si128(mask.simd()), m_simd);
#else
    SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
#endif
}


/// Gather: lane i loads from byte offset vindex[i]*scale past baseptr.
template <int scale>
OIIO_FORCEINLINE void
vfloat4::gather (const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm_i32gather_ps (baseptr, vindex, scale);
#else
    SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
#endif
}

/// Masked gather: like gather(), but lanes with a false mask get 0.
template<int scale>
OIIO_FORCEINLINE void
vfloat4::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
#else
    SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
#endif
}

/// Scatter: lane i stores to byte offset vindex[i]*scale past baseptr.
template<int scale>
OIIO_FORCEINLINE void
vfloat4::scatter (value_t *baseptr, const vint_t& vindex) const
{
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // FIXME: disable because it benchmarks slower than the dumb way
    _mm_i32scatter_ps (baseptr, vindex, m_simd, scale);
#else
    SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}

/// Masked scatter: like scatter(), but only lanes with a true mask store.
template<int scale>
OIIO_FORCEINLINE void
vfloat4::scatter_mask (const bool_t& mask, value_t *baseptr,
                       const vint_t& vindex) const
{
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // FIXME: disable because it benchmarks slower than the dumb way
    _mm_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
#else
    SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}


// Component-wise arithmetic operators.

OIIO_FORCEINLINE vfloat4 operator+ (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_add_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vaddq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vfloat4, a[i] + b[i]);
#endif
}

OIIO_FORCEINLINE const vfloat4 & vfloat4::operator+= (const vfloat4& a) {
#if OIIO_SIMD_SSE
    m_simd = _mm_add_ps (m_simd, a.m_simd);
#elif OIIO_SIMD_NEON
    m_simd = vaddq_f32 (m_simd, a.m_simd);
#else
    SIMD_DO (m_val[i] += a[i]);
#endif
    return *this;
}

OIIO_FORCEINLINE vfloat4 vfloat4::operator- () const {
#if OIIO_SIMD_SSE
    return _mm_sub_ps (_mm_setzero_ps(), m_simd);
#elif OIIO_SIMD_NEON
    return vsubq_f32 (Zero(), m_simd);
#else
    SIMD_RETURN (vfloat4, -m_val[i]);
#endif
}

OIIO_FORCEINLINE vfloat4 operator- (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_sub_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vsubq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vfloat4, a[i] - b[i]);
#endif
}

OIIO_FORCEINLINE const vfloat4 & vfloat4::operator-= (const vfloat4& a) {
#if OIIO_SIMD_SSE
    m_simd = _mm_sub_ps (m_simd, a.m_simd);
#elif OIIO_SIMD_NEON
    m_simd = vsubq_f32 (m_simd, a.m_simd);
#else
    SIMD_DO (m_val[i] -= a[i]);
#endif
    return *this;
}

OIIO_FORCEINLINE vfloat4 operator* (const vfloat4& a, float b) {
#if OIIO_SIMD_SSE
    return _mm_mul_ps (a.m_simd, _mm_set1_ps(b));
#elif OIIO_SIMD_NEON
    return vmulq_n_f32 (a.m_simd, b);
#else
    SIMD_RETURN (vfloat4, a[i] * b);
#endif
}

OIIO_FORCEINLINE vfloat4 operator* (float a, const vfloat4& b) {
    return b * a;
}

OIIO_FORCEINLINE vfloat4 operator* (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_mul_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vmulq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vfloat4, a[i] * b[i]);
#endif
}

OIIO_FORCEINLINE const vfloat4 & vfloat4::operator*= (const vfloat4& a) {
#if OIIO_SIMD_SSE
    m_simd = _mm_mul_ps (m_simd, a.m_simd);
#elif OIIO_SIMD_NEON
    m_simd = vmulq_f32 (m_simd, a.m_simd);
#else
    SIMD_DO (m_val[i] *= a[i]);
#endif
    return *this;
}

OIIO_FORCEINLINE const vfloat4 & vfloat4::operator*= (float val) {
#if OIIO_SIMD_SSE
    m_simd = _mm_mul_ps (m_simd, _mm_set1_ps(val));
#elif OIIO_SIMD_NEON
    m_simd = vmulq_n_f32 (m_simd, val);
#else
    SIMD_DO (m_val[i] *= val);
#endif
    return *this;
}

OIIO_FORCEINLINE vfloat4 operator/ (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_div_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vdivq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vfloat4, a[i] / b[i]);
#endif
}

OIIO_FORCEINLINE const vfloat4 & vfloat4::operator/= (const vfloat4& a) {
#if OIIO_SIMD_SSE
    m_simd = _mm_div_ps (m_simd, a.m_simd);
#elif OIIO_SIMD_NEON
    m_simd = vdivq_f32 (m_simd, a.m_simd);
#else
    SIMD_DO (m_val[i] /= a[i]);
#endif
    return *this;
}

OIIO_FORCEINLINE const vfloat4 & vfloat4::operator/= (float val) {
#if OIIO_SIMD_SSE
    m_simd = _mm_div_ps (m_simd, _mm_set1_ps(val));
#elif OIIO_SIMD_NEON
    m_simd = vdivq_f32 (m_simd, vfloat4(val));
#else
    SIMD_DO (m_val[i] /= val);
#endif
    return *this;
}

// Component-wise comparisons. Each lane of the result is all-ones (true)
// or all-zeros (false), per the scalar fallback's -1/0 convention.

OIIO_FORCEINLINE vbool4 operator== (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_cmpeq_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vceqq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
#endif
}

OIIO_FORCEINLINE vbool4 operator!= (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_cmpneq_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    // implemented as NOT(a == b)
    return vmvnq_u32(vceqq_f32 (a.m_simd, b.m_simd));
#else
    SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
#endif
}

OIIO_FORCEINLINE vbool4 operator< (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_cmplt_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vcltq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
#endif
}

OIIO_FORCEINLINE vbool4 operator> (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_cmpgt_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vcgtq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
#endif
}

OIIO_FORCEINLINE vbool4 operator>= (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_cmpge_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vcgeq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vbool4, a[i] >= b[i] ? -1 : 0);
#endif
}

OIIO_FORCEINLINE vbool4 operator<= (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_cmple_ps (a.m_simd, b.m_simd);
#elif OIIO_SIMD_NEON
    return vcleq_f32 (a.m_simd, b.m_simd);
#else
    SIMD_RETURN (vfloat4, a[i] <= b[i] ? -1 : 0);
#endif
}

/// Combine the low halves: result is (a[0], a[1], b[0], b[1]).
OIIO_FORCEINLINE vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_movelh_ps (a.m_simd, b.m_simd);
#else
    return vfloat4 (a[0], a[1], b[0], b[1]);
#endif
}

/// Interleave the low halves: result is (a[0], b[0], a[1], b[1]).
OIIO_FORCEINLINE vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_unpacklo_ps (a.m_simd, b.m_simd);
#else
    return vfloat4 (a[0], b[0], a[1], b[1]);
#endif
}

/// Return (x, y, z, 0).
OIIO_FORCEINLINE vfloat4 vfloat4::xyz0 () const {
    return insert<3>(*this, 0.0f);
}

/// Return (x, y, z, 1).
OIIO_FORCEINLINE vfloat4 vfloat4::xyz1 () const {
    return insert<3>(*this, 1.0f);
}

/// Stream output: space-separated components.
inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val) {
    cout << val[0];
    for (int i = 1; i < val.elements; ++i)
        cout << ' ' << val[i];
    return cout;
}


// Implementation had to be after the definition of vfloat4.
7115 OIIO_FORCEINLINE vint4::vint4 (const vfloat4& f) 7116 { 7117 #if OIIO_SIMD_SSE 7118 m_simd = _mm_cvttps_epi32(f.simd()); 7119 #else 7120 SIMD_CONSTRUCT ((int) f[i]); 7121 #endif 7122 } 7123 7124 7125 template<int i0, int i1, int i2, int i3> 7126 OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { 7127 #if OIIO_SIMD_SSE 7128 return shuffle_sse<i0,i1,i2,i3> (__m128(a)); 7129 #else 7130 return vfloat4(a[i0], a[i1], a[i2], a[i3]); 7131 #endif 7132 } 7133 7134 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); } 7135 7136 #if OIIO_SIMD_NEON 7137 template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) { 7138 float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0); 7139 } 7140 template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) { 7141 float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1); 7142 } 7143 template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) { 7144 float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0); 7145 } 7146 template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) { 7147 float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1); 7148 } 7149 #endif 7150 7151 7152 7153 /// Helper: as rapid as possible extraction of one component, when the 7154 /// index is fixed. 
template<int i>
OIIO_FORCEINLINE float extract (const vfloat4& a) {
#if OIIO_SIMD_SSE
    return _mm_cvtss_f32(shuffle_sse<i,i,i,i>(a.simd()));
#else
    return a[i];
#endif
}

#if OIIO_SIMD_SSE
// Lane 0 needs no shuffle at all.
template<> OIIO_FORCEINLINE float extract<0> (const vfloat4& a) {
    return _mm_cvtss_f32(a.simd());
}
#endif


/// Helper: substitute val for a[i]
template<int i>
OIIO_FORCEINLINE vfloat4 insert (const vfloat4& a, float val) {
#if OIIO_SIMD_SSE >= 4
    return _mm_insert_ps (a, _mm_set_ss(val), i<<4);
#else
    vfloat4 tmp = a;
    tmp[i] = val;
    return tmp;
#endif
}

#if OIIO_SIMD_SSE
// Slightly faster special cases for SSE
template<> OIIO_FORCEINLINE vfloat4 insert<0> (const vfloat4& a, float val) {
    return _mm_move_ss (a.simd(), _mm_set_ss(val));
}
#endif


// Named component accessors and mutators, built on extract/insert.
OIIO_FORCEINLINE float vfloat4::x () const { return extract<0>(*this); }
OIIO_FORCEINLINE float vfloat4::y () const { return extract<1>(*this); }
OIIO_FORCEINLINE float vfloat4::z () const { return extract<2>(*this); }
OIIO_FORCEINLINE float vfloat4::w () const { return extract<3>(*this); }
OIIO_FORCEINLINE void vfloat4::set_x (float val) { *this = insert<0>(*this, val); }
OIIO_FORCEINLINE void vfloat4::set_y (float val) { *this = insert<1>(*this, val); }
OIIO_FORCEINLINE void vfloat4::set_z (float val) { *this = insert<2>(*this, val); }
OIIO_FORCEINLINE void vfloat4::set_w (float val) { *this = insert<3>(*this, val); }


/// Reinterpret the bits of a float vector as an int vector (no conversion).
OIIO_FORCEINLINE vint4 bitcast_to_int (const vfloat4& x)
{
#if OIIO_SIMD_SSE
    return _mm_castps_si128 (x.simd());
#else
    return *(vint4 *)&x;
#endif
}

/// Reinterpret the bits of an int vector as a float vector (no conversion).
OIIO_FORCEINLINE vfloat4 bitcast_to_float (const vint4& x)
{
#if OIIO_SIMD_SSE
    return _mm_castsi128_ps (x.simd());
#else
    return *(vfloat4 *)&x;
#endif
}


// Old names:
inline vint4 bitcast_to_int4 (const vfloat4& x) { return bitcast_to_int(x); }
inline vfloat4 bitcast_to_float4 (const vint4& x) { return bitcast_to_float(x); }



/// Sum of all components, returned broadcast to all lanes.
OIIO_FORCEINLINE vfloat4 vreduce_add (const vfloat4& v) {
#if OIIO_SIMD_SSE >= 3
    // People seem to agree that SSE3 does add reduction best with 2
    // horizontal adds.
    // suppose v = (a, b, c, d)
    simd::vfloat4 ab_cd = _mm_hadd_ps (v.simd(), v.simd());
    // ab_cd = (a+b, c+d, a+b, c+d)
    simd::vfloat4 abcd = _mm_hadd_ps (ab_cd.simd(), ab_cd.simd());
    // all abcd elements are a+b+c+d
    return abcd;
#elif OIIO_SIMD_SSE
    // I think this is the best we can do for SSE2, and I'm still not sure
    // it's faster than the default scalar operation. But anyway...
    // suppose v = (a, b, c, d)
    vfloat4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
    // now x = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
    vfloat4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
    // now y = (c+d,c+d,a+b,a+b)
    vfloat4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;   // a+b+c+d in all components
    return abcd;
#else
    return vfloat4 (v[0] + v[1] + v[2] + v[3]);
#endif
}


/// Sum of all components, returned as a scalar.
OIIO_FORCEINLINE float reduce_add (const vfloat4& v) {
#if OIIO_SIMD_SSE
    return _mm_cvtss_f32(vreduce_add (v));
#elif OIIO_SIMD_NEON
    return vaddvq_f32(v);
#else
    return v[0] + v[1] + v[2] + v[3];
#endif
}

/// 4-component dot product, returned broadcast to all lanes.
OIIO_FORCEINLINE vfloat4 vdot (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE >= 4
    return _mm_dp_ps (a.simd(), b.simd(), 0xff);
#elif OIIO_SIMD_NEON
    float32x4_t ab = vmulq_f32(a, b);
    float32x4_t sum1 = vaddq_f32(ab, vrev64q_f32(ab));
    return vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
#else
    return vreduce_add (a*b);
#endif
}

/// 4-component dot product, returned as a scalar.
OIIO_FORCEINLINE float dot (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE >= 4
    return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0xff));
#else
    return reduce_add (a*b);
#endif
}

/// 3-component dot product (ignoring the 4th lane), broadcast to all lanes.
OIIO_FORCEINLINE vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE >= 4
    return _mm_dp_ps (a.simd(), b.simd(), 0x7f);
#else
    return vreduce_add((a*b).xyz0());
#endif
}

/// 3-component dot product (ignoring the 4th lane), as a scalar.
OIIO_FORCEINLINE float dot3 (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE >= 4
    return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
#else
    return reduce_add ((a*b).xyz0());
#endif
}


/// Per-lane choice: result[i] = mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask)
{
#if OIIO_SIMD_SSE >= 4
    // SSE >= 4.1 only
    return _mm_blendv_ps (a.simd(), b.simd(), mask.simd());
#elif OIIO_SIMD_SSE
    // Trick for SSE < 4.1
    return _mm_or_ps (_mm_and_ps(mask.simd(), b.simd()),
                      _mm_andnot_ps(mask.simd(), a.simd()));
#elif OIIO_SIMD_NEON
    return vbslq_f32 (mask.simd(), b.simd(), a.simd());
#else
    return vfloat4 (mask[0] ? b[0] : a[0],
                    mask[1] ? b[1] : a[1],
                    mask[2] ? b[2] : a[2],
                    mask[3] ? b[3] : a[3]);
#endif
}


/// Per-lane: result[i] = mask[i] ? a[i] : 0.
OIIO_FORCEINLINE vfloat4 blend0 (const vfloat4& a, const vbool4& mask)
{
#if OIIO_SIMD_SSE
    return _mm_and_ps(mask.simd(), a.simd());
#else
    return vfloat4 (mask[0] ? a[0] : 0.0f,
                    mask[1] ? a[1] : 0.0f,
                    mask[2] ? a[2] : 0.0f,
                    mask[3] ? a[3] : 0.0f);
#endif
}


/// Per-lane: result[i] = mask[i] ? 0 : a[i].
OIIO_FORCEINLINE vfloat4 blend0not (const vfloat4& a, const vbool4& mask)
{
#if OIIO_SIMD_SSE
    return _mm_andnot_ps(mask.simd(), a.simd());
#else
    return vfloat4 (mask[0] ? 0.0f : a[0],
                    mask[1] ? 0.0f : a[1],
                    mask[2] ? 0.0f : a[2],
                    mask[3] ? 0.0f : a[3]);
#endif
}


/// Division that yields 0 (not Inf/NaN) in any lane where b is 0.
OIIO_FORCEINLINE vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE
    return blend0not (a/b, b == vfloat4::Zero());
#else
    return vfloat4 (b[0] == 0.0f ? 0.0f : a[0] / b[0],
                    b[1] == 0.0f ? 0.0f : a[1] / b[1],
                    b[2] == 0.0f ? 0.0f : a[2] / b[2],
                    b[3] == 0.0f ? 0.0f : a[3] / b[3]);
#endif
}


/// Homogeneous divide: (x/w, y/w, z/w), yielding 0s when w is 0.
OIIO_FORCEINLINE vfloat3 hdiv (const vfloat4 &a)
{
#if OIIO_SIMD_SSE
    return vfloat3(safe_div(a, shuffle<3>(a)).xyz0());
#else
    float d = a[3];
    return d == 0.0f ? vfloat3 (0.0f) : vfloat3 (a[0]/d, a[1]/d, a[2]/d);
#endif
}



/// Per-lane choice with the arguments in "select" order:
/// result[i] = mask[i] ? a[i] : b[i].
OIIO_FORCEINLINE vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b)
{
    return blend (b, a, mask);
}


/// Per-lane absolute value.
OIIO_FORCEINLINE vfloat4 abs (const vfloat4& a)
{
#if OIIO_SIMD_SSE
    // Just clear the sign bit for cheap fabsf
    return _mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
#elif OIIO_SIMD_NEON
    return vabsq_f32(a.simd());
#else
    SIMD_RETURN (vfloat4, fabsf(a[i]));
#endif
}


/// Per-lane sign: -1.0 where a < 0, else +1.0 (including for 0).
OIIO_FORCEINLINE vfloat4 sign (const vfloat4& a)
{
    vfloat4 one(1.0f);
    return blend (one, -one, a < vfloat4::Zero());
}


/// Per-lane ceiling.
OIIO_FORCEINLINE vfloat4 ceil (const vfloat4& a)
{
#if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
    return _mm_ceil_ps (a);
#else
    SIMD_RETURN (vfloat4, ceilf(a[i]));
#endif
}

/// Per-lane floor.
OIIO_FORCEINLINE vfloat4 floor (const vfloat4& a)
{
#if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
    return _mm_floor_ps (a);
#else
    SIMD_RETURN (vfloat4, floorf(a[i]));
#endif
}

/// Per-lane round to nearest.
OIIO_FORCEINLINE vfloat4 round (const vfloat4& a)
{
#if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
    return _mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
#else
    SIMD_RETURN (vfloat4, roundf(a[i]));
#endif
}

/// Per-lane floor, returning the result as ints.
OIIO_FORCEINLINE vint4 ifloor (const vfloat4& a)
{
    // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
#if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
    return vint4(floor(a));
#else
    SIMD_RETURN (vint4, (int)floorf(a[i]));
#endif
}


/// Per-lane round to nearest, returning the result as ints.
OIIO_FORCEINLINE vint4 rint (const vfloat4& a)
{
    return vint4 (round(a));
}


/// Fast approximate reciprocal (lower precision than 1.0f/a).
OIIO_FORCEINLINE vfloat4 rcp_fast (const vfloat4 &a)
{
#if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
    // avx512vl directly has rcp14 on float4
    vfloat4 r = _mm_rcp14_ps(a);
    // One Newton-Raphson refinement step: r * (2 - r*a)
    return r * nmadd(r,a,vfloat4(2.0f));
#elif OIIO_SIMD_AVX512
    // Trickery: in and out of the 512 bit registers to use fast approx rcp
    vfloat16 r = _mm512_rcp14_ps(_mm512_castps128_ps512(a));
    return _mm512_castps512_ps128(r);
#elif OIIO_SIMD_SSE
    vfloat4 r = _mm_rcp_ps(a);
    // One Newton-Raphson refinement step: r * (2 - r*a)
    return r * nmadd(r,a,vfloat4(2.0f));
#else
    SIMD_RETURN (vfloat4, 1.0f/a[i]);
#endif
}


/// Per-lane square root.
OIIO_FORCEINLINE vfloat4 sqrt (const vfloat4 &a)
{
#if OIIO_SIMD_SSE
    return _mm_sqrt_ps (a.simd());
#else
    SIMD_RETURN (vfloat4, sqrtf(a[i]));
#endif
}


/// Per-lane 1/sqrt, full precision.
OIIO_FORCEINLINE vfloat4 rsqrt (const vfloat4 &a)
{
#if OIIO_SIMD_SSE
    return _mm_div_ps (_mm_set1_ps(1.0f), _mm_sqrt_ps (a.simd()));
#else
    SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
#endif
}


/// Per-lane 1/sqrt, fast approximation (lower precision than rsqrt).
OIIO_FORCEINLINE vfloat4 rsqrt_fast (const vfloat4 &a)
{
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
    // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
    return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
#elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
    return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
#elif OIIO_SIMD_SSE
    return _mm_rsqrt_ps (a.simd());
#else
    SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
#endif
}


/// Per-lane minimum.
OIIO_FORCEINLINE vfloat4 min (const vfloat4& a, const vfloat4& b)
{
#if OIIO_SIMD_SSE
    return _mm_min_ps (a, b);
#elif OIIO_SIMD_NEON
    return vminq_f32(a, b);
#else
    SIMD_RETURN (vfloat4, std::min (a[i], b[i]));
#endif
}

/// Per-lane maximum.
OIIO_FORCEINLINE vfloat4 max (const vfloat4& a, const vfloat4& b)
{
#if OIIO_SIMD_SSE
    return _mm_max_ps (a, b);
#elif OIIO_SIMD_NEON
    return vmaxq_f32(a, b);
#else
    SIMD_RETURN (vfloat4, std::max (a[i], b[i]));
#endif
}


/// Bitwise (~a) & b on the float bit patterns.
OIIO_FORCEINLINE vfloat4 andnot (const vfloat4& a, const vfloat4& b) {
#if OIIO_SIMD_SSE
    return _mm_andnot_ps (a.simd(), b.simd());
#else
    const int *ai = (const int *)&a;
    const int *bi = (const int *)&b;
    return bitcast_to_float (vint4(~(ai[0]) & bi[0],
                                   ~(ai[1]) & bi[1],
                                   ~(ai[2]) & bi[2],
                                   ~(ai[3]) & bi[3]));
#endif
}


/// Fused (when hardware allows) multiply-add: a*b + c.
OIIO_FORCEINLINE vfloat4 madd (const simd::vfloat4& a, const simd::vfloat4& b,
                               const simd::vfloat4& c)
{
#if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
    // If we are sure _mm_fmadd_ps intrinsic is available, use it.
    return _mm_fmadd_ps (a, b, c);
#elif OIIO_SIMD_NEON
    return vmlaq_f32(c.simd(), a.simd(), b.simd());
#elif OIIO_SIMD_SSE && !defined(_MSC_VER)
    // If we directly access the underlying __m128, on some platforms and
    // compiler flags, it will turn into fma anyway, even if we don't use
    // the intrinsic.
    return a.simd() * b.simd() + c.simd();
#else
    // Fallback: just use regular math and hope for the best.
    return a * b + c;
#endif
}


/// Fused (when hardware allows) multiply-subtract: a*b - c.
OIIO_FORCEINLINE vfloat4 msub (const simd::vfloat4& a, const simd::vfloat4& b,
                               const simd::vfloat4& c)
{
#if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
    // If we are sure _mm_fmsub_ps intrinsic is available, use it.
    return _mm_fmsub_ps (a, b, c);
#elif OIIO_SIMD_SSE && !defined(_MSC_VER)
    // If we directly access the underlying __m128, on some platforms and
    // compiler flags, it will turn into fma anyway, even if we don't use
    // the intrinsic.
    return a.simd() * b.simd() - c.simd();
#else
    // Fallback: just use regular math and hope for the best.
    return a * b - c;
#endif
}



/// Fused (when hardware allows) negative multiply-add: c - a*b.
OIIO_FORCEINLINE vfloat4 nmadd (const simd::vfloat4& a, const simd::vfloat4& b,
                                const simd::vfloat4& c)
{
#if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
    // If we are sure _mm_fnmadd_ps intrinsic is available, use it.
    return _mm_fnmadd_ps (a, b, c);
#elif OIIO_SIMD_SSE && !defined(_MSC_VER)
    // If we directly access the underlying __m128, on some platforms and
    // compiler flags, it will turn into fma anyway, even if we don't use
    // the intrinsic.
    return c.simd() - a.simd() * b.simd();
#else
    // Fallback: just use regular math and hope for the best.
    return c - a * b;
#endif
}



/// Fused (when hardware allows) negative multiply-subtract: -(a*b) - c.
OIIO_FORCEINLINE vfloat4 nmsub (const simd::vfloat4& a, const simd::vfloat4& b,
                                const simd::vfloat4& c)
{
#if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
    // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
    return _mm_fnmsub_ps (a, b, c);
#elif OIIO_SIMD_SSE && !defined(_MSC_VER)
    // If we directly access the underlying __m128, on some platforms and
    // compiler flags, it will turn into fma anyway, even if we don't use
    // the intrinsic.
    return -(a.simd() * b.simd()) - c.simd();
#else
    // Fallback: just use regular math and hope for the best.
    return -(a * b) - c;
#endif
}



// Full precision exp() of all components of a SIMD vector.
template<typename T>
OIIO_FORCEINLINE T exp (const T& v)
{
#if OIIO_SIMD_SSE
    // Implementation inspired by:
    // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
    // Which is listed as Copyright (C) 2007 Julien Pommier and distributed
    // under the zlib license.
    typedef typename T::vint_t int_t;
    T x = v;
    // Clamp bounds and Cephes-style polynomial coefficients.
    const float exp_hi (88.3762626647949f);
    const float exp_lo (-88.3762626647949f);
    const float cephes_LOG2EF (1.44269504088896341f);
    const float cephes_exp_C1 (0.693359375f);
    const float cephes_exp_C2 (-2.12194440e-4f);
    const float cephes_exp_p0 (1.9875691500E-4f);
    const float cephes_exp_p1 (1.3981999507E-3f);
    const float cephes_exp_p2 (8.3334519073E-3f);
    const float cephes_exp_p3 (4.1665795894E-2f);
    const float cephes_exp_p4 (1.6666665459E-1f);
    const float cephes_exp_p5 (5.0000001201E-1f);
    T tmp (0.0f);
    T one (1.0f);
    // Clamp input to the representable range.
    x = min (x, T(exp_hi));
    x = max (x, T(exp_lo));
    // Split into integer power of two and fractional remainder.
    T fx = madd (x, T(cephes_LOG2EF), T(0.5f));
    int_t emm0 = int_t(fx);
    tmp = T(emm0);
    T mask = bitcast_to_float (bitcast_to_int(tmp > fx) & bitcast_to_int(one));
    fx = tmp - mask;
    tmp = fx * cephes_exp_C1;
    T z = fx * cephes_exp_C2;
    x = x - tmp;
    x = x - z;
    z = x * x;
    // Polynomial approximation of exp on the remainder.
    T y = cephes_exp_p0;
    y = madd (y, x, cephes_exp_p1);
    y = madd (y, x, cephes_exp_p2);
    y = madd (y, x, cephes_exp_p3);
    y = madd (y, x, cephes_exp_p4);
    y = madd (y, x, cephes_exp_p5);
    y = madd (y, z, x);
    y = y + one;
    // Build 2^n by placing the biased exponent bits directly.
    emm0 = (int_t(fx) + int_t(0x7f)) << 23;
    T pow2n = bitcast_to_float(emm0);
    y = y * pow2n;
    return y;
#else
    SIMD_RETURN (T, expf(v[i]));
#endif
}



// Full precision log() of all components of a SIMD vector.
// Natural logarithm, elementwise, for any SIMD float type T that provides
// companion T::vint_t / T::vbool_t types. Lanes <= 0 produce NaN.
template<typename T>
OIIO_FORCEINLINE T log (const T& v)
{
#if OIIO_SIMD_SSE
    // Implementation inspired by:
    // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
    // Which is listed as Copyright (C) 2007 Julien Pommier and distributed
    // under the zlib license.
    typedef typename T::vint_t int_t;
    typedef typename T::vbool_t bool_t;
    T x = v;
    int_t emm0;
    T zero (T::Zero());
    T one (1.0f);
    // Remember which lanes are invalid (<= 0); they are forced to NaN at
    // the end by OR-ing in the all-ones mask.
    bool_t invalid_mask = (x <= zero);
    const int min_norm_pos ((int)0x00800000);    // smallest normalized float
    const int inv_mant_mask ((int)~0x7f800000);  // everything but the exponent bits
    x = max(x, bitcast_to_float(int_t(min_norm_pos)));  /* cut off denormalized stuff */
    // Extract the (biased) exponent field.
    emm0 = srl (bitcast_to_int(x), 23);
    /* keep only the fractional part */
    x = bitcast_to_float (bitcast_to_int(x) & int_t(inv_mant_mask));
    x = bitcast_to_float (bitcast_to_int(x) | bitcast_to_int(T(0.5f)));
    emm0 = emm0 - int_t(0x7f);   // remove the IEEE exponent bias
    T e (emm0);
    e = e + one;
    // OIIO_SIMD_vFLOAT4_CONST (cephes_SQRTHF, 0.707106781186547524f);
    const float cephes_SQRTHF (0.707106781186547524f);
    // Where the mantissa is below sqrt(1/2), decrement the exponent and
    // double the mantissa (via the masked add of x below) so the
    // polynomial argument stays centered near 1.
    bool_t mask = (x < T(cephes_SQRTHF));
    T tmp = bitcast_to_float (bitcast_to_int(x) & bitcast_to_int(mask));
    x = x - one;
    e = e - bitcast_to_float (bitcast_to_int(one) & bitcast_to_int(mask));
    x = x + tmp;
    T z = x * x;
    // Cephes polynomial coefficients for log(1+x) on the reduced interval.
    const float cephes_log_p0 (7.0376836292E-2f);
    const float cephes_log_p1 (- 1.1514610310E-1f);
    const float cephes_log_p2 (1.1676998740E-1f);
    const float cephes_log_p3 (- 1.2420140846E-1f);
    const float cephes_log_p4 (+ 1.4249322787E-1f);
    const float cephes_log_p5 (- 1.6668057665E-1f);
    const float cephes_log_p6 (+ 2.0000714765E-1f);
    const float cephes_log_p7 (- 2.4999993993E-1f);
    const float cephes_log_p8 (+ 3.3333331174E-1f);
    const float cephes_log_q1 (-2.12194440e-4f);     // lo part of ln(2)
    const float cephes_log_q2 (0.693359375f);        // hi part of ln(2)
    // Horner evaluation of the polynomial.
    T y = cephes_log_p0;
    y = madd (y, x, T(cephes_log_p1));
    y = madd (y, x, T(cephes_log_p2));
    y = madd (y, x, T(cephes_log_p3));
    y = madd (y, x, T(cephes_log_p4));
    y = madd (y, x, T(cephes_log_p5));
    y = madd (y, x, T(cephes_log_p6));
    y = madd (y, x, T(cephes_log_p7));
    y = madd (y, x, T(cephes_log_p8));
    y = y * x;
    y = y * z;
    // Add e*ln(2), using the two-part split (q1 + q2) for precision.
    y = madd(e, T(cephes_log_q1), y);
    y = nmadd (z, 0.5f, y);
    x = x + y;
    x = madd (e, T(cephes_log_q2), x);
    x = bitcast_to_float (bitcast_to_int(x) | bitcast_to_int(invalid_mask)); // negative arg will be NAN
    return x;
#else
    // No SIMD available: fall back to scalar logf per element.
    SIMD_RETURN (T, logf(v[i]));
#endif
}



// In-place 4x4 transpose, treating a,b,c,d as the four rows of a matrix.
OIIO_FORCEINLINE void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d)
{
#if OIIO_SIMD_SSE
    _MM_TRANSPOSE4_PS (a.simd(), b.simd(), c.simd(), d.simd());
#else
    vfloat4 A (a[0], b[0], c[0], d[0]);
    vfloat4 B (a[1], b[1], c[1], d[1]);
    vfloat4 C (a[2], b[2], c[2], d[2]);
    vfloat4 D (a[3], b[3], c[3], d[3]);
    a = A; b = B; c = C; d = D;
#endif
}


// 4x4 transpose of rows a,b,c,d, written into separate results r0..r3
// (inputs are untouched).
OIIO_FORCEINLINE void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
                                 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3)
{
#if OIIO_SIMD_SSE
    //_MM_TRANSPOSE4_PS (a, b, c, d);
    // Two rounds of unpacks: first interleave rows 0/2 and 1/3, then
    // interleave those to produce the transposed rows.
    auto l02 = _mm_unpacklo_ps (a, c);
    auto h02 = _mm_unpackhi_ps (a, c);
    auto l13 = _mm_unpacklo_ps (b, d);
    auto h13 = _mm_unpackhi_ps (b, d);
    r0 = vfloat4(_mm_unpacklo_ps (l02, l13));
    r1 = vfloat4(_mm_unpackhi_ps (l02, l13));
    r2 = vfloat4(_mm_unpacklo_ps (h02, h13));
    r3 = vfloat4(_mm_unpackhi_ps (h02, h13));
#else
    r0.load (a[0], b[0], c[0], d[0]);
    r1.load (a[1], b[1], c[1], d[1]);
    r2.load (a[2], b[2], c[2], d[2]);
    r3.load (a[3], b[3], c[3], d[3]);
#endif
}


// In-place 4x4 transpose of int rows: reuse the float transpose by
// bit-casting (no value conversion takes place).
OIIO_FORCEINLINE void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d)
{
#if OIIO_SIMD_SSE
    __m128 A = _mm_castsi128_ps (a);
    __m128 B = _mm_castsi128_ps (b);
    __m128 C = _mm_castsi128_ps (c);
    __m128 D = _mm_castsi128_ps (d);
    _MM_TRANSPOSE4_PS (A, B, C, D);
    a = _mm_castps_si128 (A);
    b = _mm_castps_si128 (B);
    c = _mm_castps_si128 (C);
    d = _mm_castps_si128 (D);
#else
    vint4 A (a[0], b[0], c[0], d[0]);
    vint4 B (a[1], b[1], c[1], d[1]);
    vint4 C (a[2], b[2], c[2], d[2]);
    vint4 D (a[3], b[3], c[3], d[3]);
    a = A; b = B; c = C; d = D;
#endif
}

// Non-destructive 4x4 transpose of int rows into r0..r3, again via
// bit-casts to the float transpose macro.
OIIO_FORCEINLINE void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
                                 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3)
{
#if OIIO_SIMD_SSE
    //_MM_TRANSPOSE4_PS (a, b, c, d);
    __m128 A = _mm_castsi128_ps (a);
    __m128 B = _mm_castsi128_ps (b);
    __m128 C = _mm_castsi128_ps (c);
    __m128 D = _mm_castsi128_ps (d);
    _MM_TRANSPOSE4_PS (A, B, C, D);
    r0 = _mm_castps_si128 (A);
    r1 = _mm_castps_si128 (B);
    r2 = _mm_castps_si128 (C);
    r3 = _mm_castps_si128 (D);
#else
    r0.load (a[0], b[0], c[0], d[0]);
    r1.load (a[1], b[1], c[1], d[1]);
    r2.load (a[2], b[2], c[2], d[2]);
    r3.load (a[3], b[3], c[3], d[3]);
#endif
}


// Gather element 0 of each of a,b,c,d into one vfloat4: (a[0],b[0],c[0],d[0]).
OIIO_FORCEINLINE vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
                                   const vfloat4& c, const vfloat4& d)
{
#if OIIO_SIMD_SSE
    vfloat4 l02 = _mm_unpacklo_ps (a, c);
    vfloat4 l13 = _mm_unpacklo_ps (b, d);
    return _mm_unpacklo_ps (l02, l13);
#else
    return vfloat4 (a[0], b[0], c[0], d[0]);
#endif
}


// Integer version: gather element 0 of each of a,b,c,d into one vint4.
OIIO_FORCEINLINE vint4 AxBxCxDx (const vint4& a, const vint4& b,
                                 const vint4& c, const vint4& d)
{
#if OIIO_SIMD_SSE
    vint4 l02 = _mm_unpacklo_epi32 (a, c);
    vint4 l13 = _mm_unpacklo_epi32 (b, d);
    return _mm_unpacklo_epi32 (l02, l13);
#else
    return vint4 (a[0], b[0], c[0], d[0]);
#endif
}
////////////////////////////////////////////////////////////////////// 7840 // vfloat3 implementation 7841 7842 OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat3 &other) : vfloat4(other) { 7843 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON 7844 m_simd = other.m_simd; 7845 #else 7846 SIMD_CONSTRUCT_PAD (other[i]); 7847 #endif 7848 } 7849 7850 OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat4 &other) { 7851 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON 7852 m_simd = other.simd(); 7853 #else 7854 SIMD_CONSTRUCT_PAD (other[i]); 7855 m_val[3] = 0.0f; 7856 #endif 7857 } 7858 7859 OIIO_FORCEINLINE const vfloat3 vfloat3::Zero () { return vfloat3(vfloat4::Zero()); } 7860 7861 OIIO_FORCEINLINE const vfloat3 vfloat3::One () { return vfloat3(1.0f); } 7862 7863 OIIO_FORCEINLINE const vfloat3 vfloat3::Iota (float start, float step) { 7864 return vfloat3 (start+0.0f*step, start+1.0f*step, start+2.0f*step); 7865 } 7866 7867 7868 OIIO_FORCEINLINE void vfloat3::load (float val) { vfloat4::load (val, val, val, 0.0f); } 7869 7870 OIIO_FORCEINLINE void vfloat3::load (const float *values) { vfloat4::load (values, 3); } 7871 7872 OIIO_FORCEINLINE void vfloat3::load (const float *values, int n) { 7873 vfloat4::load (values, n); 7874 } 7875 7876 OIIO_FORCEINLINE void vfloat3::load (const unsigned short *values) { 7877 vfloat4::load (float(values[0]), float(values[1]), float(values[2])); 7878 } 7879 7880 OIIO_FORCEINLINE void vfloat3::load (const short *values) { 7881 vfloat4::load (float(values[0]), float(values[1]), float(values[2])); 7882 } 7883 7884 OIIO_FORCEINLINE void vfloat3::load (const unsigned char *values) { 7885 vfloat4::load (float(values[0]), float(values[1]), float(values[2])); 7886 } 7887 7888 OIIO_FORCEINLINE void vfloat3::load (const char *values) { 7889 vfloat4::load (float(values[0]), float(values[1]), float(values[2])); 7890 } 7891 7892 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 7893 OIIO_FORCEINLINE void vfloat3::load (const half *values) { 7894 vfloat4::load (float(values[0]), 
float(values[1]), float(values[2])); 7895 } 7896 #endif /* _HALF_H_ or _IMATH_H_ */ 7897 7898 OIIO_FORCEINLINE void vfloat3::store (float *values) const { 7899 vfloat4::store (values, 3); 7900 } 7901 7902 OIIO_FORCEINLINE void vfloat3::store (float *values, int n) const { 7903 vfloat4::store (values, n); 7904 } 7905 7906 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 7907 OIIO_FORCEINLINE void vfloat3::store (half *values) const { 7908 SIMD_DO (values[i] = m_val[i]); 7909 } 7910 #endif 7911 7912 OIIO_FORCEINLINE void vfloat3::store (Imath::V3f &vec) const { 7913 store ((float *)&vec); 7914 } 7915 7916 OIIO_FORCEINLINE vfloat3 operator+ (const vfloat3& a, const vfloat3& b) { 7917 return vfloat3 (vfloat4(a) + vfloat4(b)); 7918 } 7919 7920 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator+= (const vfloat3& a) { 7921 *this = *this + a; return *this; 7922 } 7923 7924 OIIO_FORCEINLINE vfloat3 vfloat3::operator- () const { 7925 return vfloat3 (-vfloat4(*this)); 7926 } 7927 7928 OIIO_FORCEINLINE vfloat3 operator- (const vfloat3& a, const vfloat3& b) { 7929 return vfloat3 (vfloat4(a) - vfloat4(b)); 7930 } 7931 7932 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator-= (const vfloat3& a) { 7933 *this = *this - a; return *this; 7934 } 7935 7936 OIIO_FORCEINLINE vfloat3 operator* (const vfloat3& a, const vfloat3& b) { 7937 return vfloat3 (vfloat4(a) * vfloat4(b)); 7938 } 7939 7940 OIIO_FORCEINLINE vfloat3 operator* (const vfloat3& a, float b) { 7941 return vfloat3 (vfloat4(a) * b); 7942 } 7943 7944 OIIO_FORCEINLINE vfloat3 operator* (float a, const vfloat3& b) { 7945 return b * a; 7946 } 7947 7948 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator*= (const vfloat3& a) { 7949 *this = *this * a; return *this; 7950 } 7951 7952 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator*= (float a) { 7953 *this = *this * a; return *this; 7954 } 7955 7956 OIIO_FORCEINLINE vfloat3 operator/ (const vfloat3& a, const vfloat3& b) { 7957 return vfloat3 (vfloat4(a) / b.xyz1()); // Avoid divide 
by zero! 7958 } 7959 7960 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator/= (const vfloat3& a) { 7961 *this = *this / a; return *this; 7962 } 7963 7964 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator/= (float a) { 7965 *this = *this / a; return *this; 7966 } 7967 7968 7969 inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val) { 7970 cout << val[0]; 7971 for (int i = 1; i < val.elements; ++i) 7972 cout << ' ' << val[i]; 7973 return cout; 7974 } 7975 7976 7977 OIIO_FORCEINLINE vfloat3 abs (const vfloat3& a) 7978 { 7979 #if OIIO_SIMD_SSE 7980 // Just clear the sign bit for cheap fabsf 7981 return vfloat3(_mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); 7982 #elif OIIO_SIMD_NEON 7983 return vfloat3(vabsq_f32(a.simd())); 7984 #else 7985 SIMD_RETURN (vfloat3, fabsf(a[i])); 7986 #endif 7987 } 7988 7989 7990 OIIO_FORCEINLINE vfloat3 sign (const vfloat3& a) 7991 { 7992 vfloat3 one(1.0f); 7993 return vfloat3(blend (one, -one, a < vfloat3::Zero())); 7994 } 7995 7996 7997 OIIO_FORCEINLINE vfloat3 ceil (const vfloat3& a) 7998 { 7999 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 8000 return vfloat3(_mm_ceil_ps (a)); 8001 #else 8002 SIMD_RETURN (vfloat3, ceilf(a[i])); 8003 #endif 8004 } 8005 8006 OIIO_FORCEINLINE vfloat3 floor (const vfloat3& a) 8007 { 8008 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 8009 return vfloat3(_mm_floor_ps (a)); 8010 #else 8011 SIMD_RETURN (vfloat3, floorf(a[i])); 8012 #endif 8013 } 8014 8015 OIIO_FORCEINLINE vfloat3 round (const vfloat3& a) 8016 { 8017 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */ 8018 return vfloat3(_mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC))); 8019 #else 8020 SIMD_RETURN (vfloat3, roundf(a[i])); 8021 #endif 8022 } 8023 8024 8025 OIIO_FORCEINLINE vfloat3 vreduce_add (const vfloat3& v) { 8026 #if OIIO_SIMD_SSE 8027 return vfloat3 ((vreduce_add(vfloat4(v))).xyz0()); 8028 #else 8029 return vfloat3 (v[0] + v[1] + v[2]); 8030 #endif 8031 } 8032 8033 8034 OIIO_FORCEINLINE vfloat3 vdot 
(const vfloat3 &a, const vfloat3 &b) { 8035 #if OIIO_SIMD_SSE >= 4 8036 return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77)); 8037 #else 8038 return vreduce_add (a*b); 8039 #endif 8040 } 8041 8042 8043 OIIO_FORCEINLINE float dot (const vfloat3 &a, const vfloat3 &b) { 8044 #if OIIO_SIMD_SSE >= 4 8045 return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77)); 8046 #elif OIIO_SIMD 8047 return reduce_add (a*b); 8048 #else 8049 return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; 8050 #endif 8051 } 8052 8053 8054 OIIO_FORCEINLINE vfloat3 vdot3 (const vfloat3 &a, const vfloat3 &b) { 8055 #if OIIO_SIMD_SSE >= 4 8056 return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77)); 8057 #else 8058 return vfloat3 (vreduce_add((a*b).xyz0()).xyz0()); 8059 #endif 8060 } 8061 8062 8063 OIIO_FORCEINLINE float vfloat3::length2 () const 8064 { 8065 return dot(*this, *this); 8066 } 8067 8068 8069 OIIO_FORCEINLINE float vfloat3::length () const 8070 { 8071 return sqrtf(dot(*this, *this)); 8072 } 8073 8074 8075 OIIO_FORCEINLINE vfloat3 vfloat3::normalized () const 8076 { 8077 #if OIIO_SIMD 8078 vfloat3 len2 = vdot3 (*this, *this); 8079 return vfloat3 (safe_div (*this, sqrt(len2))); 8080 #else 8081 float len2 = dot (*this, *this); 8082 return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero(); 8083 #endif 8084 } 8085 8086 8087 OIIO_FORCEINLINE vfloat3 vfloat3::normalized_fast () const 8088 { 8089 #if OIIO_SIMD 8090 vfloat3 len2 = vdot3 (*this, *this); 8091 vfloat4 invlen = blend0not (rsqrt_fast (len2), len2 == vfloat4::Zero()); 8092 return vfloat3 ((*this) * invlen); 8093 #else 8094 float len2 = dot (*this, *this); 8095 return len2 > 0.0f ? 
(*this) / sqrtf(len2) : vfloat3::Zero(); 8096 #endif 8097 } 8098 8099 8100 8101 ////////////////////////////////////////////////////////////////////// 8102 // matrix44 implementation 8103 8104 8105 OIIO_FORCEINLINE const Imath::M44f& matrix44::M44f() const { 8106 return *(Imath::M44f*)this; 8107 } 8108 8109 8110 OIIO_FORCEINLINE vfloat4 matrix44::operator[] (int i) const { 8111 #if OIIO_SIMD_SSE 8112 return m_row[i]; 8113 #else 8114 return vfloat4 (m_mat[i]); 8115 #endif 8116 } 8117 8118 8119 OIIO_FORCEINLINE matrix44 matrix44::transposed () const { 8120 matrix44 T; 8121 #if OIIO_SIMD_SSE 8122 simd::transpose (m_row[0], m_row[1], m_row[2], m_row[3], 8123 T.m_row[0], T.m_row[1], T.m_row[2], T.m_row[3]); 8124 #else 8125 T.m_mat = m_mat.transposed(); 8126 #endif 8127 return T; 8128 } 8129 8130 OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const { 8131 #if OIIO_SIMD_SSE 8132 vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] + 8133 shuffle<2>(V) * m_row[2] + m_row[3]; 8134 R = R / shuffle<3>(R); 8135 return vfloat3 (R.xyz0()); 8136 #else 8137 Imath::V3f R; 8138 m_mat.multVecMatrix (*(Imath::V3f *)&V, R); 8139 return vfloat3(R); 8140 #endif 8141 } 8142 8143 OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const { 8144 #if OIIO_SIMD_SSE 8145 vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] + 8146 shuffle<2>(V) * m_row[2]; 8147 return vfloat3 (R.xyz0()); 8148 #else 8149 Imath::V3f R; 8150 m_mat.multDirMatrix (*(Imath::V3f *)&V, R); 8151 return vfloat3(R); 8152 #endif 8153 } 8154 8155 OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const { 8156 #if OIIO_SIMD_SSE 8157 matrix44 T = transposed(); 8158 vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] + 8159 shuffle<2>(V) * T[2]; 8160 return vfloat3 (R.xyz0()); 8161 #else 8162 Imath::V3f R; 8163 m_mat.transposed().multDirMatrix (*(Imath::V3f *)&V, R); 8164 return vfloat3(R); 8165 #endif 8166 } 8167 8168 OIIO_FORCEINLINE vfloat4 operator* 
(const vfloat4 &V, const matrix44& M) 8169 { 8170 #if OIIO_SIMD_SSE 8171 return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] + 8172 shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3]; 8173 #else 8174 return vfloat4(V.V4f() * M.M44f()); 8175 #endif 8176 } 8177 8178 OIIO_FORCEINLINE vfloat4 operator* (const matrix44& M, const vfloat4 &V) 8179 { 8180 #if OIIO_SIMD_SSE >= 3 8181 vfloat4 m0v = M[0] * V; // [ M00*Vx, M01*Vy, M02*Vz, M03*Vw ] 8182 vfloat4 m1v = M[1] * V; // [ M10*Vx, M11*Vy, M12*Vz, M13*Vw ] 8183 vfloat4 m2v = M[2] * V; // [ M20*Vx, M21*Vy, M22*Vz, M23*Vw ] 8184 vfloat4 m3v = M[3] * V; // [ M30*Vx, M31*Vy, M32*Vz, M33*Vw ] 8185 vfloat4 s01 = _mm_hadd_ps(m0v, m1v); 8186 // [ M00*Vx + M01*Vy, M02*Vz + M03*Vw, M10*Vx + M11*Vy, M12*Vz + M13*Vw ] 8187 vfloat4 s23 = _mm_hadd_ps(m2v, m3v); 8188 // [ M20*Vx + M21*Vy, M22*Vz + M23*Vw, M30*Vx + M31*Vy, M32*Vz + M33*Vw ] 8189 vfloat4 result = _mm_hadd_ps(s01, s23); 8190 // [ M00*Vx + M01*Vy + M02*Vz + M03*Vw, 8191 // M10*Vx + M11*Vy + M12*Vz + M13*Vw, 8192 // M20*Vx + M21*Vy + M22*Vz + M23*Vw, 8193 // M30*Vx + M31*Vy + M32*Vz + M33*Vw ] 8194 return result; 8195 #else 8196 return vfloat4(dot(M[0], V), dot(M[1], V), dot(M[2], V), dot(M[3], V)); 8197 #endif 8198 } 8199 8200 8201 OIIO_FORCEINLINE bool matrix44::operator== (const matrix44& m) const { 8202 #if OIIO_SIMD_SSE 8203 vbool4 b0 = (m_row[0] == m[0]); 8204 vbool4 b1 = (m_row[1] == m[1]); 8205 vbool4 b2 = (m_row[2] == m[2]); 8206 vbool4 b3 = (m_row[3] == m[3]); 8207 return simd::all (b0 & b1 & b2 & b3); 8208 #else 8209 return memcmp(this, &m, 16*sizeof(float)) == 0; 8210 #endif 8211 } 8212 8213 OIIO_FORCEINLINE bool matrix44::operator== (const Imath::M44f& m) const { 8214 return memcmp(this, &m, 16*sizeof(float)) == 0; 8215 } 8216 8217 OIIO_FORCEINLINE bool operator== (const Imath::M44f& a, const matrix44 &b) { 8218 return (b == a); 8219 } 8220 8221 OIIO_FORCEINLINE bool matrix44::operator!= (const matrix44& m) const { 8222 #if OIIO_SIMD_SSE 8223 vbool4 b0 = (m_row[0] != 
m[0]); 8224 vbool4 b1 = (m_row[1] != m[1]); 8225 vbool4 b2 = (m_row[2] != m[2]); 8226 vbool4 b3 = (m_row[3] != m[3]); 8227 return simd::any (b0 | b1 | b2 | b3); 8228 #else 8229 return memcmp(this, &m, 16*sizeof(float)) != 0; 8230 #endif 8231 } 8232 8233 OIIO_FORCEINLINE bool matrix44::operator!= (const Imath::M44f& m) const { 8234 return memcmp(this, &m, 16*sizeof(float)) != 0; 8235 } 8236 8237 OIIO_FORCEINLINE bool operator!= (const Imath::M44f& a, const matrix44 &b) { 8238 return (b != a); 8239 } 8240 8241 OIIO_FORCEINLINE matrix44 matrix44::inverse() const { 8242 #if OIIO_SIMD_SSE 8243 // Adapted from this code from Intel: 8244 // ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf 8245 vfloat4 minor0, minor1, minor2, minor3; 8246 vfloat4 row0, row1, row2, row3; 8247 vfloat4 det, tmp1; 8248 const float *src = (const float *)this; 8249 vfloat4 zero = vfloat4::Zero(); 8250 tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4))); 8251 row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12))); 8252 row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88)); 8253 row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD)); 8254 tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6))); 8255 row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14))); 8256 row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88)); 8257 row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD)); 8258 // ----------------------------------------------- 8259 tmp1 = row2 * row3; 8260 tmp1 = shuffle<1,0,3,2>(tmp1); 8261 minor0 = row1 * tmp1; 8262 minor1 = row0 * tmp1; 8263 tmp1 = shuffle<2,3,0,1>(tmp1); 8264 minor0 = (row1 * tmp1) - minor0; 8265 minor1 = (row0 * tmp1) - minor1; 8266 minor1 = shuffle<2,3,0,1>(minor1); 8267 // ----------------------------------------------- 8268 tmp1 = row1 * row2; 8269 tmp1 = shuffle<1,0,3,2>(tmp1); 8270 minor0 = (row3 * tmp1) + minor0; 8271 minor3 = row0 * tmp1; 8272 tmp1 = 
shuffle<2,3,0,1>(tmp1); 8273 minor0 = minor0 - (row3 * tmp1); 8274 minor3 = (row0 * tmp1) - minor3; 8275 minor3 = shuffle<2,3,0,1>(minor3); 8276 // ----------------------------------------------- 8277 tmp1 = shuffle<2,3,0,1>(row1) * row3; 8278 tmp1 = shuffle<1,0,3,2>(tmp1); 8279 row2 = shuffle<2,3,0,1>(row2); 8280 minor0 = (row2 * tmp1) + minor0; 8281 minor2 = row0 * tmp1; 8282 tmp1 = shuffle<2,3,0,1>(tmp1); 8283 minor0 = minor0 - (row2 * tmp1); 8284 minor2 = (row0 * tmp1) - minor2; 8285 minor2 = shuffle<2,3,0,1>(minor2); 8286 // ----------------------------------------------- 8287 tmp1 = row0 * row1; 8288 tmp1 = shuffle<1,0,3,2>(tmp1); 8289 minor2 = (row3 * tmp1) + minor2; 8290 minor3 = (row2 * tmp1) - minor3; 8291 tmp1 = shuffle<2,3,0,1>(tmp1); 8292 minor2 = (row3 * tmp1) - minor2; 8293 minor3 = minor3 - (row2 * tmp1); 8294 // ----------------------------------------------- 8295 tmp1 = row0 * row3; 8296 tmp1 = shuffle<1,0,3,2>(tmp1); 8297 minor1 = minor1 - (row2 * tmp1); 8298 minor2 = (row1 * tmp1) + minor2; 8299 tmp1 = shuffle<2,3,0,1>(tmp1); 8300 minor1 = (row2 * tmp1) + minor1; 8301 minor2 = minor2 - (row1 * tmp1); 8302 // ----------------------------------------------- 8303 tmp1 = row0 * row2; 8304 tmp1 = shuffle<1,0,3,2>(tmp1); 8305 minor1 = (row3 * tmp1) + minor1; 8306 minor3 = minor3 - (row1 * tmp1); 8307 tmp1 = shuffle<2,3,0,1>(tmp1); 8308 minor1 = minor1 - (row3 * tmp1); 8309 minor3 = (row1 * tmp1) + minor3; 8310 // ----------------------------------------------- 8311 det = row0 * minor0; 8312 det = shuffle<2,3,0,1>(det) + det; 8313 det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det)); 8314 tmp1 = vfloat4(_mm_rcp_ss(det)); 8315 det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)))); 8316 det = shuffle<0>(det); 8317 return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3); 8318 #else 8319 return matrix44 (m_mat.inverse()); 8320 #endif 8321 } 8322 8323 8324 inline std::ostream& operator<< (std::ostream& cout, 
const matrix44 &M) { 8325 const float *m = (const float *)&M; 8326 cout << m[0]; 8327 for (int i = 1; i < 16; ++i) 8328 cout << ' ' << m[i]; 8329 return cout; 8330 } 8331 8332 8333 8334 OIIO_FORCEINLINE vfloat3 transformp (const matrix44 &M, const vfloat3 &V) { 8335 return M.transformp (V); 8336 } 8337 8338 OIIO_FORCEINLINE vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V) 8339 { 8340 #if OIIO_SIMD 8341 return matrix44(M).transformp (V); 8342 #else 8343 Imath::V3f R; 8344 M.multVecMatrix (*(const Imath::V3f *)&V, R); 8345 return vfloat3(R); 8346 #endif 8347 } 8348 8349 8350 OIIO_FORCEINLINE vfloat3 transformv (const matrix44 &M, const vfloat3 &V) { 8351 return M.transformv (V); 8352 } 8353 8354 OIIO_FORCEINLINE vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V) 8355 { 8356 #if OIIO_SIMD 8357 return matrix44(M).transformv (V); 8358 #else 8359 Imath::V3f R; 8360 M.multDirMatrix (*(const Imath::V3f *)&V, R); 8361 return vfloat3(R); 8362 #endif 8363 } 8364 8365 OIIO_FORCEINLINE vfloat3 transformvT (const matrix44 &M, const vfloat3 &V) 8366 { 8367 return M.transformvT (V); 8368 } 8369 8370 OIIO_FORCEINLINE vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V) 8371 { 8372 #if OIIO_SIMD 8373 return matrix44(M).transformvT(V); 8374 #else 8375 return transformv (M.transposed(), V); 8376 #endif 8377 } 8378 8379 8380 8381 ////////////////////////////////////////////////////////////////////// 8382 // vfloat8 implementation 8383 8384 OIIO_FORCEINLINE float& vfloat8::operator[] (int i) { 8385 OIIO_DASSERT(i<elements); 8386 return m_val[i]; 8387 } 8388 8389 OIIO_FORCEINLINE float vfloat8::operator[] (int i) const { 8390 OIIO_DASSERT(i<elements); 8391 return m_val[i]; 8392 } 8393 8394 8395 inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val) { 8396 cout << val[0]; 8397 for (int i = 1; i < val.elements; ++i) 8398 cout << ' ' << val[i]; 8399 return cout; 8400 } 8401 8402 8403 OIIO_FORCEINLINE vfloat4 vfloat8::lo () const { 8404 #if 
OIIO_SIMD_AVX 8405 return _mm256_castps256_ps128 (simd()); 8406 #else 8407 return m_4[0]; 8408 #endif 8409 } 8410 8411 OIIO_FORCEINLINE vfloat4 vfloat8::hi () const { 8412 #if OIIO_SIMD_AVX 8413 return _mm256_extractf128_ps (simd(), 1); 8414 #else 8415 return m_4[1]; 8416 #endif 8417 } 8418 8419 8420 OIIO_FORCEINLINE vfloat8::vfloat8 (const vfloat4& lo, const vfloat4 &hi) { 8421 #if OIIO_SIMD_AVX 8422 __m256 r = _mm256_castps128_ps256 (lo); 8423 m_simd = _mm256_insertf128_ps (r, hi, 1); 8424 // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo); 8425 // FIXME: when would that not be available? 8426 #else 8427 m_4[0] = lo; 8428 m_4[1] = hi; 8429 #endif 8430 } 8431 8432 8433 OIIO_FORCEINLINE vfloat8::vfloat8 (const vint8& ival) { 8434 #if OIIO_SIMD_AVX 8435 m_simd = _mm256_cvtepi32_ps (ival); 8436 #else 8437 SIMD_CONSTRUCT (float(ival[i])); 8438 #endif 8439 } 8440 8441 8442 OIIO_FORCEINLINE const vfloat8 vfloat8::Zero () { 8443 #if OIIO_SIMD_AVX 8444 return _mm256_setzero_ps(); 8445 #else 8446 return vfloat8(0.0f); 8447 #endif 8448 } 8449 8450 OIIO_FORCEINLINE const vfloat8 vfloat8::One () { 8451 return vfloat8(1.0f); 8452 } 8453 8454 OIIO_FORCEINLINE const vfloat8 vfloat8::Iota (float start, float step) { 8455 return vfloat8 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step, 8456 start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step); 8457 } 8458 8459 /// Set all components to 0.0 8460 OIIO_FORCEINLINE void vfloat8::clear () { 8461 #if OIIO_SIMD_AVX 8462 m_simd = _mm256_setzero_ps(); 8463 #else 8464 load (0.0f); 8465 #endif 8466 } 8467 8468 8469 8470 OIIO_FORCEINLINE void vfloat8::load (float val) { 8471 #if OIIO_SIMD_AVX 8472 m_simd = _mm256_set1_ps (val); 8473 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8474 m_4[0].load(val); 8475 m_4[1].load(val); 8476 #else 8477 SIMD_CONSTRUCT (val); 8478 #endif 8479 } 8480 8481 OIIO_FORCEINLINE void vfloat8::load (float a, float b, float c, float d, 8482 float e, float f, float g, float h) 
{ 8483 #if OIIO_SIMD_AVX 8484 m_simd = _mm256_set_ps (h, g, f, e, d, c, b, a); 8485 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8486 m_4[0].load(a, b, c, d); 8487 m_4[1].load(e, f, g, h); 8488 #else 8489 m_val[0] = a; 8490 m_val[1] = b; 8491 m_val[2] = c; 8492 m_val[3] = d; 8493 m_val[4] = e; 8494 m_val[5] = f; 8495 m_val[6] = g; 8496 m_val[7] = h; 8497 #endif 8498 } 8499 8500 8501 OIIO_FORCEINLINE void vfloat8::load (const float *values) { 8502 #if OIIO_SIMD_AVX 8503 m_simd = _mm256_loadu_ps (values); 8504 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8505 m_4[0].load(values); 8506 m_4[1].load(values+4); 8507 #else 8508 SIMD_CONSTRUCT (values[i]); 8509 #endif 8510 } 8511 8512 8513 OIIO_FORCEINLINE void vfloat8::load (const float *values, int n) { 8514 OIIO_DASSERT (n >= 0 && n <= elements); 8515 #if 0 && OIIO_AVX512VL_ENABLED 8516 // This SHOULD be fast, but in my benchmarks, it is slower! 8517 // (At least on the AVX512 hardware I have, Xeon Silver 4110.) 8518 // Re-test this periodically with new Intel hardware. 
8519 m_simd = _mm256_maskz_loadu_ps ((~(0xff << n)), values); 8520 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8521 if (n > 4) { 8522 vfloat4 lo, hi; 8523 lo.load (values); 8524 hi.load (values+4, n-4); 8525 m_4[0] = lo; 8526 m_4[1] = hi; 8527 } else { 8528 vfloat4 lo, hi; 8529 lo.load (values, n); 8530 hi.clear(); 8531 m_4[0] = lo; 8532 m_4[1] = hi; 8533 } 8534 #else 8535 for (int i = 0; i < n; ++i) 8536 m_val[i] = values[i]; 8537 for (int i = n; i < paddedelements; ++i) 8538 m_val[i] = 0; 8539 #endif 8540 } 8541 8542 8543 OIIO_FORCEINLINE void vfloat8::load (const unsigned short *values) { 8544 #if OIIO_SIMD_AVX 8545 // Rely on the ushort->int conversion, then convert to float 8546 m_simd = _mm256_cvtepi32_ps (vint8(values).simd()); 8547 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8548 m_4[0].load(values); 8549 m_4[1].load(values+4); 8550 #else 8551 SIMD_CONSTRUCT (values[i]); 8552 #endif 8553 } 8554 8555 8556 OIIO_FORCEINLINE void vfloat8::load (const short *values) { 8557 #if OIIO_SIMD_AVX 8558 // Rely on the short->int conversion, then convert to float 8559 m_simd = _mm256_cvtepi32_ps (vint8(values).simd()); 8560 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8561 m_4[0].load(values); 8562 m_4[1].load(values+4); 8563 #else 8564 SIMD_CONSTRUCT (values[i]); 8565 #endif 8566 } 8567 8568 8569 OIIO_FORCEINLINE void vfloat8::load (const unsigned char *values) { 8570 #if OIIO_SIMD_AVX 8571 m_simd = _mm256_cvtepi32_ps (vint8(values).simd()); 8572 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8573 m_4[0].load(values); 8574 m_4[1].load(values+4); 8575 #else 8576 SIMD_CONSTRUCT (values[i]); 8577 #endif 8578 } 8579 8580 8581 OIIO_FORCEINLINE void vfloat8::load (const char *values) { 8582 #if OIIO_SIMD_AVX 8583 m_simd = _mm256_cvtepi32_ps (vint8(values).simd()); 8584 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8585 m_4[0].load(values); 8586 m_4[1].load(values+4); 8587 #else 8588 SIMD_CONSTRUCT (values[i]); 8589 #endif 8590 } 8591 8592 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 8593 OIIO_FORCEINLINE 
void vfloat8::load (const half *values) { 8594 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED 8595 /* Enabled 16 bit float instructions! */ 8596 vint4 a ((const int *)values); 8597 m_simd = _mm256_cvtph_ps (a); 8598 #elif OIIO_SIMD_SSE >= 2 8599 m_4[0] = vfloat4(values); 8600 m_4[1] = vfloat4(values+4); 8601 #else /* No SIMD defined: */ 8602 SIMD_CONSTRUCT (values[i]); 8603 #endif 8604 } 8605 #endif /* _HALF_H_ or _IMATH_H_ */ 8606 8607 8608 OIIO_FORCEINLINE void vfloat8::store (float *values) const { 8609 #if OIIO_SIMD_AVX 8610 // Use an unaligned store -- it's just as fast when the memory turns 8611 // out to be aligned, nearly as fast even when unaligned. Not worth 8612 // the headache of using stores that require alignment. 8613 _mm256_storeu_ps (values, m_simd); 8614 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8615 m_4[0].store(values); 8616 m_4[1].store(values+4); 8617 #else 8618 SIMD_DO (values[i] = m_val[i]); 8619 #endif 8620 } 8621 8622 8623 OIIO_FORCEINLINE void vfloat8::store (float *values, int n) const { 8624 OIIO_DASSERT (n >= 0 && n <= elements); 8625 #if 0 && OIIO_AVX512VL_ENABLED 8626 // This SHOULD be fast, but in my benchmarks, it is slower! 8627 // (At least on the AVX512 hardware I have, Xeon Silver 4110.) 8628 // Re-test this periodically with new Intel hardware. 
8629 _mm256_mask_storeu_ps (values, __mmask8(~(0xff << n)), m_simd); 8630 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8631 if (n <= 4) { 8632 lo().store (values, n); 8633 } else if (n <= 8) { 8634 lo().store (values); 8635 hi().store (values+4, n-4); 8636 } 8637 #else 8638 for (int i = 0; i < n; ++i) 8639 values[i] = m_val[i]; 8640 #endif 8641 } 8642 8643 #if defined(_HALF_H_) || defined(IMATH_HALF_H_) 8644 OIIO_FORCEINLINE void vfloat8::store (half *values) const { 8645 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED 8646 __m128i h = _mm256_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); 8647 _mm_storeu_si128 ((__m128i *)values, h); 8648 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON 8649 m_4[0].store(values); 8650 m_4[1].store(values+4); 8651 #else 8652 SIMD_DO (values[i] = m_val[i]); 8653 #endif 8654 } 8655 #endif 8656 8657 8658 OIIO_FORCEINLINE void vfloat8::load_mask (int mask, const float *values) { 8659 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 8660 m_simd = _mm256_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values); 8661 #elif OIIO_SIMD_AVX 8662 m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask))); 8663 #else 8664 SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f); 8665 #endif 8666 } 8667 8668 8669 OIIO_FORCEINLINE void vfloat8::load_mask (const vbool8& mask, const float *values) { 8670 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 8671 m_simd = _mm256_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values); 8672 #elif OIIO_SIMD_AVX 8673 m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(mask)); 8674 #else 8675 SIMD_CONSTRUCT (mask[i] ? 
values[i] : 0.0f); 8676 #endif 8677 } 8678 8679 8680 OIIO_FORCEINLINE void vfloat8::store_mask (int mask, float *values) const { 8681 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 8682 _mm256_mask_storeu_ps (values, __mmask8(mask), m_simd); 8683 #elif OIIO_SIMD_AVX 8684 _mm256_maskstore_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd); 8685 #else 8686 SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]); 8687 #endif 8688 } 8689 8690 8691 OIIO_FORCEINLINE void vfloat8::store_mask (const vbool8& mask, float *values) const { 8692 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 8693 _mm256_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd); 8694 #elif OIIO_SIMD_AVX 8695 _mm256_maskstore_ps (values, _mm256_castps_si256(mask.simd()), m_simd); 8696 #else 8697 SIMD_DO (if (mask[i]) values[i] = (*this)[i]); 8698 #endif 8699 } 8700 8701 8702 template <int scale> 8703 OIIO_FORCEINLINE void 8704 vfloat8::gather (const value_t *baseptr, const vint_t& vindex) 8705 { 8706 #if OIIO_SIMD_AVX >= 2 8707 m_simd = _mm256_i32gather_ps (baseptr, vindex, scale); 8708 #else 8709 SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale)); 8710 #endif 8711 } 8712 8713 template<int scale> 8714 OIIO_FORCEINLINE void 8715 vfloat8::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex) 8716 { 8717 #if OIIO_SIMD_AVX >= 2 8718 m_simd = _mm256_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale); 8719 #else 8720 SIMD_CONSTRUCT (mask[i] ? 
*(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0); 8721 #endif 8722 } 8723 8724 template<int scale> 8725 OIIO_FORCEINLINE void 8726 vfloat8::scatter (value_t *baseptr, const vint_t& vindex) const 8727 { 8728 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 8729 _mm256_i32scatter_ps (baseptr, vindex, m_simd, scale); 8730 #else 8731 SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]); 8732 #endif 8733 } 8734 8735 template<int scale> 8736 OIIO_FORCEINLINE void 8737 vfloat8::scatter_mask (const bool_t& mask, value_t *baseptr, 8738 const vint_t& vindex) const 8739 { 8740 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED 8741 _mm256_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale); 8742 #else 8743 SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]); 8744 #endif 8745 } 8746 8747 8748 8749 OIIO_FORCEINLINE vfloat8 operator+ (const vfloat8& a, const vfloat8& b) { 8750 #if OIIO_SIMD_AVX 8751 return _mm256_add_ps (a, b); 8752 #else 8753 return vfloat8 (a.lo()+b.lo(), a.hi()+b.hi()); 8754 #endif 8755 } 8756 8757 OIIO_FORCEINLINE const vfloat8 & operator+= (vfloat8 & a, const vfloat8& b) { 8758 return a = a + b; 8759 } 8760 8761 OIIO_FORCEINLINE vfloat8 operator- (const vfloat8& a) { 8762 #if OIIO_SIMD_AVX 8763 return _mm256_sub_ps (_mm256_setzero_ps(), a); 8764 #else 8765 return vfloat8 (-a.lo(), -a.hi()); 8766 #endif 8767 } 8768 8769 OIIO_FORCEINLINE vfloat8 operator- (const vfloat8& a, const vfloat8& b) { 8770 #if OIIO_SIMD_AVX 8771 return _mm256_sub_ps (a, b); 8772 #else 8773 return vfloat8 (a.lo()-b.lo(), a.hi()-b.hi()); 8774 #endif 8775 } 8776 8777 OIIO_FORCEINLINE const vfloat8 & operator-= (vfloat8 & a, const vfloat8& b) { 8778 return a = a - b; 8779 } 8780 8781 OIIO_FORCEINLINE vfloat8 operator* (const vfloat8& a, float b) { 8782 #if OIIO_SIMD_AVX 8783 return _mm256_mul_ps (a.m_simd, _mm256_set1_ps(b)); 8784 #else 8785 return vfloat8 (a.lo()*b, a.hi()*b); 8786 #endif 8787 } 8788 8789 
OIIO_FORCEINLINE vfloat8 operator* (float a, const vfloat8& b) {
    return b * a;
}

OIIO_FORCEINLINE vfloat8 operator* (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_mul_ps (a, b);
#else
    return vfloat8 (a.lo()*b.lo(), a.hi()*b.hi());
#endif
}

OIIO_FORCEINLINE const vfloat8 & operator*= (vfloat8 & a, const vfloat8& b) {
    return a = a * b;
}

OIIO_FORCEINLINE vfloat8 operator/ (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_div_ps (a, b);
#else
    return vfloat8 (a.lo()/b.lo(), a.hi()/b.hi());
#endif
}

OIIO_FORCEINLINE const vfloat8 & operator/= (vfloat8 & a, const vfloat8& b) {
    return a = a / b;
}

// Per-lane comparisons yielding a vbool8 mask. The AVX paths use the
// "ordered, non-signaling" (_OQ) predicates.

OIIO_FORCEINLINE vbool8 operator== (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_cmp_ps (a, b, _CMP_EQ_OQ);
#else
    return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
#endif
}

OIIO_FORCEINLINE vbool8 operator!= (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    // NOTE(review): _CMP_NEQ_OQ is the *ordered* not-equal, which yields
    // false when either operand is NaN, unlike scalar `!=` (true for NaN).
    // Confirm this is intentional vs. _CMP_NEQ_UQ.
    return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ);
#else
    return vbool8 (a.lo() != b.lo(), a.hi() != b.hi());
#endif
}

OIIO_FORCEINLINE vbool8 operator< (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_cmp_ps (a, b, _CMP_LT_OQ);
#else
    return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
#endif
}

OIIO_FORCEINLINE vbool8 operator> (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_cmp_ps (a, b, _CMP_GT_OQ);
#else
    return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
#endif
}

OIIO_FORCEINLINE vbool8 operator>= (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_cmp_ps (a, b, _CMP_GE_OQ);
#else
    return vbool8 (a.lo() >= b.lo(), a.hi() >= b.hi());
#endif
}

OIIO_FORCEINLINE vbool8 operator<= (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_cmp_ps (a, b, _CMP_LE_OQ);
#else
    return vbool8 (a.lo() <= b.lo(), a.hi() <= b.hi());
#endif
}


// Implementation had to be after the definition of vfloat8.
// Float-to-int conversion truncates toward zero (cvtt).
OIIO_FORCEINLINE vint8::vint8 (const vfloat8& f)
{
#if OIIO_SIMD_AVX
    m_simd = _mm256_cvttps_epi32(f);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    *this = vint8 (vint4(f.lo()), vint4(f.hi()));
#else
    SIMD_CONSTRUCT ((int) f[i]);
#endif
}


/// Arbitrary 8-lane permutation: result lane k = a[ik].
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
#if OIIO_SIMD_AVX >= 2
    vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
    return _mm256_permutevar8x32_ps (a, index);
#else
    return vfloat8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
#endif
}

/// Broadcast lane i to all 8 lanes.
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_permutevar8x32_ps (a, vint8(i));
#else
    return shuffle<i,i,i,i,i,i,i,i>(a);
#endif
}


/// Return the single float in lane i.
template<int i>
OIIO_FORCEINLINE float extract (const vfloat8& v) {
#if OIIO_SIMD_AVX_NO_FIXME
    // Looks like the fastest we can do it is to extract a vfloat4,
    // shuffle its one element everywhere, then extract element 0.
    // NOTE(review): this branch is disabled (OIIO_SIMD_AVX_NO_FIXME is
    // never defined) and would not compile as written: `_m128` should be
    // `__m128`, _mm256_extractf128_ps is missing its vector argument (v),
    // `a` is not a parameter (should be f4), and `j` is not a constant
    // expression usable as a template argument.
    _m128 f4 = _mm256_extractf128_ps (i >> 2);
    int j = i & 3;
    return _mm_cvtss_f32(shuffle_sse<j,j,j,j>(a.simd()));
#else
    return v[i];
#endif
}


/// Return a copy of a with lane i replaced by val.
template<int i>
OIIO_FORCEINLINE vfloat8 insert (const vfloat8& a, float val) {
#if OIIO_SIMD_AVX_NO_FIXME
    // NOTE(review): disabled branch; _mm256_insert_epi32 operates on
    // integer vectors, not __m256/float — verify before ever enabling.
    return _mm256_insert_epi32 (a, val, i);
#else
    vfloat8 tmp = a;
    tmp[i] = val;
    return tmp;
#endif
}


// Named accessors for the first four lanes (Imath-style x/y/z/w).
OIIO_FORCEINLINE float vfloat8::x () const { return extract<0>(*this); }
OIIO_FORCEINLINE float vfloat8::y () const { return extract<1>(*this); }
OIIO_FORCEINLINE float vfloat8::z () const { return extract<2>(*this); }
OIIO_FORCEINLINE float vfloat8::w () const { return extract<3>(*this); }
OIIO_FORCEINLINE void vfloat8::set_x (float val) { *this = insert<0>(*this, val); }
OIIO_FORCEINLINE void vfloat8::set_y (float val) { *this = insert<1>(*this, val); }
OIIO_FORCEINLINE void vfloat8::set_z (float val) { *this = insert<2>(*this, val); }
OIIO_FORCEINLINE void vfloat8::set_w (float val) { *this = insert<3>(*this, val); }


/// Reinterpret the bits of the 8 floats as 8 ints (no conversion).
OIIO_FORCEINLINE vint8 bitcast_to_int (const vfloat8& x)
{
#if OIIO_SIMD_AVX
    return _mm256_castps_si256 (x.simd());
#else
    // NOTE(review): type-punning via pointer cast; technically violates
    // strict aliasing (a memcpy would be the conforming form).
    return *(vint8 *)&x;
#endif
}

/// Reinterpret the bits of the 8 ints as 8 floats (no conversion).
OIIO_FORCEINLINE vfloat8 bitcast_to_float (const vint8& x)
{
#if OIIO_SIMD_AVX
    return _mm256_castsi256_ps (x.simd());
#else
    return *(vfloat8 *)&x;
#endif
}


/// Horizontal sum of all 8 lanes, replicated into every lane of the result.
OIIO_FORCEINLINE vfloat8 vreduce_add (const vfloat8& v) {
#if OIIO_SIMD_AVX
    // From Syrah:
    vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
    vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
    // get efgh in the 0-idx slot
    vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
    vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
    return shuffle<0>(final_sum);
#else
    vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
    return vfloat8(hadd4, hadd4);
#endif
}


/// Horizontal sum of all 8 lanes, returned as a scalar float.
OIIO_FORCEINLINE float reduce_add (const vfloat8& v) {
#if OIIO_SIMD_AVX >= 2
    return extract<0>(vreduce_add(v));
#else
    return reduce_add(v.lo()) + reduce_add(v.hi());
#endif
}


/// Per-lane select: mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask)
{
#if OIIO_SIMD_AVX
    return _mm256_blendv_ps (a, b, mask);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (blend (a.lo(), b.lo(), mask.lo()),
                    blend (a.hi(), b.hi(), mask.hi()));
#else
    SIMD_RETURN (vfloat8, mask[i] ? b[i] : a[i]);
#endif
}


/// Per-lane: mask[i] ? a[i] : 0.0f.
OIIO_FORCEINLINE vfloat8 blend0 (const vfloat8& a, const vbool8& mask)
{
#if OIIO_SIMD_AVX
    return _mm256_and_ps(mask, a);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (blend0 (a.lo(), mask.lo()),
                    blend0 (a.hi(), mask.hi()));
#else
    SIMD_RETURN (vfloat8, mask[i] ? a[i] : 0.0f);
#endif
}


/// Per-lane: mask[i] ? 0.0f : a[i].
OIIO_FORCEINLINE vfloat8 blend0not (const vfloat8& a, const vbool8& mask)
{
#if OIIO_SIMD_AVX
    return _mm256_andnot_ps(mask, a);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (blend0not (a.lo(), mask.lo()),
                    blend0not (a.hi(), mask.hi()));
#else
    SIMD_RETURN (vfloat8, mask[i] ? 0.0f : a[i]);
#endif
}


/// Per-lane select with the mask first (ternary-style argument order).
OIIO_FORCEINLINE vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b)
{
    return blend (b, a, mask);
}


/// Per-lane a/b, but 0 (instead of inf/NaN) wherever b is 0.
OIIO_FORCEINLINE vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b) {
#if OIIO_SIMD_SSE
    return blend0not (a/b, b == vfloat8::Zero());
#else
    SIMD_RETURN (vfloat8, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
#endif
}


OIIO_FORCEINLINE vfloat8 abs (const vfloat8& a)
{
#if OIIO_SIMD_AVX
    // Just clear the sign bit for cheap fabsf
    return _mm256_and_ps (a.simd(), _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8(abs(a.lo()), abs(a.hi()));
#else
    SIMD_RETURN (vfloat8, fabsf(a[i]));
#endif
}


/// Per-lane sign: -1.0 where a < 0, else +1.0 (0 and NaN map to +1).
OIIO_FORCEINLINE vfloat8 sign (const vfloat8& a)
{
    vfloat8 one(1.0f);
    return blend (one, -one, a < vfloat8::Zero());
}


OIIO_FORCEINLINE vfloat8 ceil (const vfloat8& a)
{
#if OIIO_SIMD_AVX
    return _mm256_ceil_ps (a);
#else
    SIMD_RETURN (vfloat8, ceilf(a[i]));
#endif
}

OIIO_FORCEINLINE vfloat8 floor (const vfloat8& a)
{
#if OIIO_SIMD_AVX
    return _mm256_floor_ps (a);
#else
    SIMD_RETURN (vfloat8, floorf(a[i]));
#endif
}

OIIO_FORCEINLINE vfloat8 round (const vfloat8& a)
{
#if OIIO_SIMD_AVX
    return _mm256_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
#else
    SIMD_RETURN (vfloat8, roundf(a[i]));
#endif
}

/// Per-lane (int)floor(a[i]).
OIIO_FORCEINLINE vint8 ifloor (const vfloat8& a)
{
    // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
#if OIIO_SIMD_AVX
    return vint8(floor(a));
#elif OIIO_SIMD_SSE /* SSE2/3 */
    return vint8 (ifloor(a.lo()), ifloor(a.hi()));
#else
    SIMD_RETURN (vint8, (int)floorf(a[i]));
#endif
}


/// Per-lane round-to-nearest, as integers.
OIIO_FORCEINLINE vint8 rint (const vfloat8& a)
{
    return vint8 (round(a));
}



/// Fast approximate reciprocal; one Newton-Raphson step is applied to
/// refine the hardware estimate.
OIIO_FORCEINLINE vfloat8 rcp_fast (const vfloat8 &a)
{
#if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
    vfloat8 r = _mm256_rcp14_ps(a);
    return r * nmadd(r,a,vfloat8(2.0f));
#elif OIIO_SIMD_AVX
    vfloat8 r = _mm256_rcp_ps(a);
    return r * nmadd(r,a,vfloat8(2.0f));
#else
    return vfloat8(rcp_fast(a.lo()), rcp_fast(a.hi()));
#endif
}


OIIO_FORCEINLINE vfloat8 sqrt (const vfloat8 &a)
{
#if OIIO_SIMD_AVX
    return _mm256_sqrt_ps (a.simd());
#else
    SIMD_RETURN (vfloat8, sqrtf(a[i]));
#endif
}



/// Full-precision 1/sqrt(a).
OIIO_FORCEINLINE vfloat8 rsqrt (const vfloat8 &a)
{
#if OIIO_SIMD_AVX
    return _mm256_div_ps (_mm256_set1_ps(1.0f), _mm256_sqrt_ps (a.simd()));
#else
    SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
#endif
}



/// Fast approximate 1/sqrt(a) (reduced precision hardware estimate).
OIIO_FORCEINLINE vfloat8 rsqrt_fast (const vfloat8 &a)
{
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
    // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
    return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
#elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
    return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
#elif OIIO_SIMD_AVX
    return _mm256_rsqrt_ps (a.simd());
#elif OIIO_SIMD_SSE
    return vfloat8 (rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
#else
    SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
#endif
}



OIIO_FORCEINLINE vfloat8 min (const vfloat8& a, const vfloat8& b)
{
#if OIIO_SIMD_AVX
    return _mm256_min_ps (a, b);
#else
    return vfloat8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
#endif
}

OIIO_FORCEINLINE vfloat8 max (const vfloat8& a, const vfloat8& b)
{
#if OIIO_SIMD_AVX
    return _mm256_max_ps (a, b);
#else
    return vfloat8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
#endif
}


/// Bitwise (~a) & b on the float bit patterns.
OIIO_FORCEINLINE vfloat8 andnot (const vfloat8& a, const vfloat8& b) {
#if OIIO_SIMD_AVX
    return _mm256_andnot_ps (a.simd(), b.simd());
#else
    const int *ai = (const int *)&a;
    const int *bi = (const int *)&b;
    return bitcast_to_float (vint8(~(ai[0]) & bi[0],
                                   ~(ai[1]) & bi[1],
                                   ~(ai[2]) & bi[2],
                                   ~(ai[3]) & bi[3],
                                   ~(ai[4]) & bi[4],
                                   ~(ai[5]) & bi[5],
                                   ~(ai[6]) & bi[6],
                                   ~(ai[7]) & bi[7]));
#endif
}


/// Fused multiply-add: a*b + c.
OIIO_FORCEINLINE vfloat8 madd (const simd::vfloat8& a, const simd::vfloat8& b,
                               const simd::vfloat8& c)
{
#if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
    // If we are sure _mm256_fmadd_ps intrinsic is available, use it.
    return _mm256_fmadd_ps (a, b, c);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (madd(a.lo(), b.lo(), c.lo()),
                    madd(a.hi(), b.hi(), c.hi()));
#else
    // Fallback: just use regular math and hope for the best.
    return a * b + c;
#endif
}


/// Fused multiply-subtract: a*b - c.
OIIO_FORCEINLINE vfloat8 msub (const simd::vfloat8& a, const simd::vfloat8& b,
                               const simd::vfloat8& c)
{
#if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
    // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
    return _mm256_fmsub_ps (a, b, c);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (msub(a.lo(), b.lo(), c.lo()),
                    msub(a.hi(), b.hi(), c.hi()));
#else
    // Fallback: just use regular math and hope for the best.
    return a * b - c;
#endif
}



/// Fused negated multiply-add: c - a*b.
OIIO_FORCEINLINE vfloat8 nmadd (const simd::vfloat8& a, const simd::vfloat8& b,
                                const simd::vfloat8& c)
{
#if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
    // If we are sure _mm256_fnmadd_ps intrinsic is available, use it.
    return _mm256_fnmadd_ps (a, b, c);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (nmadd(a.lo(), b.lo(), c.lo()),
                    nmadd(a.hi(), b.hi(), c.hi()));
#else
    // Fallback: just use regular math and hope for the best.
    return c - a * b;
#endif
}



/// Fused negated multiply-subtract: -(a*b) - c.
OIIO_FORCEINLINE vfloat8 nmsub (const simd::vfloat8& a, const simd::vfloat8& b,
                                const simd::vfloat8& c)
{
#if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
    // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
    return _mm256_fnmsub_ps (a, b, c);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    return vfloat8 (nmsub(a.lo(), b.lo(), c.lo()),
                    nmsub(a.hi(), b.hi(), c.hi()));
#else
    // Fallback: just use regular math and hope for the best.
    return -(a * b) - c;
#endif
}




//////////////////////////////////////////////////////////////////////
// vfloat16 implementation

OIIO_FORCEINLINE float& vfloat16::operator[] (int i) {
    OIIO_DASSERT(i<elements);
    return m_val[i];
}

OIIO_FORCEINLINE float vfloat16::operator[] (int i) const {
    OIIO_DASSERT(i<elements);
    return m_val[i];
}


/// Print all 16 lanes, space-separated.
inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val) {
    cout << val[0];
    for (int i = 1; i < val.elements; ++i)
        cout << ' ' << val[i];
    return cout;
}


/// Lower 8 lanes as a vfloat8.
OIIO_FORCEINLINE vfloat8 vfloat16::lo () const {
#if OIIO_SIMD_AVX >= 512
    return _mm512_castps512_ps256 (simd());
#else
    return m_8[0];
#endif
}

/// Upper 8 lanes as a vfloat8.
OIIO_FORCEINLINE vfloat8 vfloat16::hi () const {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512DQ_ENABLED
    return _mm512_extractf32x8_ps (simd(), 1);
#else
    return m_8[1];
#endif
}


OIIO_FORCEINLINE vfloat16::vfloat16 (float v0, float v1, float v2, float v3,
                                     float v4, float v5, float v6, float v7,
                                     float v8, float v9, float v10, float v11,
                                     float v12, float v13, float v14, float v15) {
    load (v0, v1, v2, v3, v4, v5, v6, v7,
          v8, v9, v10, v11, v12, v13, v14, v15);
}

/// Construct from two 8-wide halves.
OIIO_FORCEINLINE vfloat16::vfloat16 (const vfloat8& lo, const vfloat8 &hi) {
#if OIIO_SIMD_AVX >= 512
    __m512 r = _mm512_castps256_ps512 (lo);
    m_simd = _mm512_insertf32x8 (r, hi, 1);
#else
    m_8[0] = lo;
    m_8[1] = hi;
#endif
}

/// Construct from four 4-wide quarters.
OIIO_FORCEINLINE vfloat16::vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_broadcast_f32x4(a);
    m_simd = _mm512_insertf32x4 (m_simd, b, 1);
    m_simd = _mm512_insertf32x4 (m_simd, c, 2);
    m_simd = _mm512_insertf32x4 (m_simd, d, 3);
#else
    m_8[0] = vfloat8(a,b);
    m_8[1] = vfloat8(c,d);
#endif
}


/// int -> float conversion, per lane.
OIIO_FORCEINLINE vfloat16::vfloat16 (const vint16& ival) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepi32_ps (ival);
#else
    SIMD_CONSTRUCT (float(ival[i]));
#endif
}


OIIO_FORCEINLINE const vfloat16 vfloat16::Zero () {
#if OIIO_SIMD_AVX >= 512
    return _mm512_setzero_ps();
#else
    return vfloat16(0.0f);
#endif
}

OIIO_FORCEINLINE const vfloat16 vfloat16::One () {
    return vfloat16(1.0f);
}

/// Lane i = start + i*step.
OIIO_FORCEINLINE const vfloat16 vfloat16::Iota (float start, float step) {
    return vfloat16 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
                     start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step,
                     start+8.0f*step, start+9.0f*step, start+10.0f*step, start+11.0f*step,
                     start+12.0f*step, start+13.0f*step, start+14.0f*step, start+15.0f*step);
}

/// Set all components to 0.0
OIIO_FORCEINLINE void vfloat16::clear () {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_setzero_ps();
#else
    load (0.0f);
#endif
}


/// Broadcast one float to all 16 lanes.
OIIO_FORCEINLINE void vfloat16::load (float a) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_set1_ps (a);
#else
    m_8[0].load (a);
    m_8[1].load (a);
#endif
}


OIIO_FORCEINLINE void vfloat16::load (float v0, float v1, float v2, float v3,
                                      float v4, float v5, float v6, float v7,
                                      float v8, float v9, float v10, float v11,
                                      float v12, float v13, float v14, float v15) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_setr_ps (v0, v1, v2, v3, v4, v5, v6, v7,
                             v8, v9, v10, v11, v12, v13, v14, v15);
#else
    m_val[ 0] = v0;
    m_val[ 1] = v1;
    m_val[ 2] = v2;
    m_val[ 3] = v3;
    m_val[ 4] = v4;
    m_val[ 5] = v5;
    m_val[ 6] = v6;
    m_val[ 7] = v7;
    m_val[ 8] = v8;
    m_val[ 9] = v9;
    m_val[10] = v10;
    m_val[11] = v11;
    m_val[12] = v12;
    m_val[13] = v13;
    m_val[14] = v14;
    m_val[15] = v15;
#endif
}


/// Unaligned load of 16 contiguous floats.
OIIO_FORCEINLINE void vfloat16::load (const float *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_loadu_ps (values);
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}


/// Partial load: first n floats from memory, remaining lanes zeroed.
OIIO_FORCEINLINE void vfloat16::load (const float *values, int n)
{
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_maskz_loadu_ps (__mmask16(~(0xffff << n)), values);
#else
    if (n > 8) {
        m_8[0].load (values);
        m_8[1].load (values+8, n-8);
    } else {
        m_8[0].load (values, n);
        m_8[1].clear ();
    }
#endif
}


OIIO_FORCEINLINE void vfloat16::load (const unsigned short *values) {
#if OIIO_SIMD_AVX >= 512
    // Rely on the ushort->int conversion, then convert to float
    m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}


OIIO_FORCEINLINE void vfloat16::load (const short *values) {
#if OIIO_SIMD_AVX >= 512
    // Rely on the short->int conversion, then convert to float
    m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}


OIIO_FORCEINLINE void vfloat16::load (const unsigned char *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}


OIIO_FORCEINLINE void vfloat16::load (const char *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}


#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
// Only compiled when Imath's half type has been included.
OIIO_FORCEINLINE void vfloat16::load (const half *values) {
#if OIIO_SIMD_AVX >= 512
    /* Enabled 16 bit float instructions! */
    vint8 a ((const int *)values);
    m_simd = _mm512_cvtph_ps (a);
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}
#endif /* _HALF_H_ or _IMATH_H_ */



OIIO_FORCEINLINE void vfloat16::store (float *values) const {
#if OIIO_SIMD_AVX >= 512
    // Use an unaligned store -- it's just as fast when the memory turns
    // out to be aligned, nearly as fast even when unaligned. Not worth
    // the headache of using stores that require alignment.
    _mm512_storeu_ps (values, m_simd);
#else
    m_8[0].store (values);
    m_8[1].store (values+8);
#endif
}


/// Partial store: write only the first n lanes to memory.
OIIO_FORCEINLINE void vfloat16::store (float *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
    // FIXME: is this faster with AVX masked stores?
#if 0 && OIIO_SIMD_AVX >= 512
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm512_mask_storeu_ps (values, __mmask16(~(0xffff << n)), m_simd);
#else
    if (n <= 8) {
        lo().store (values, n);
    } else if (n < 16) {
        lo().store (values);
        hi().store (values+8, n-8);
    } else {
        store (values);
    }
#endif
}

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Store as 16 half-precision values (round to nearest).
OIIO_FORCEINLINE void vfloat16::store (half *values) const {
#if OIIO_SIMD_AVX >= 512
    __m256i h = _mm512_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
    _mm256_storeu_si256 ((__m256i *)values, h);
#else
    m_8[0].store (values);
    m_8[1].store (values+8);
#endif
}
#endif


/// Masked load: lanes with mask set are loaded, others zeroed.
OIIO_FORCEINLINE void vfloat16::load_mask (const vbool16 &mask, const float *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_maskz_loadu_ps (mask, (const simd_t *)values);
#else
    m_8[0].load_mask (mask.lo(), values);
    m_8[1].load_mask (mask.hi(), values+8);
#endif
}


/// Masked store: only lanes with mask set are written.
OIIO_FORCEINLINE void vfloat16::store_mask (const vbool16 &mask, float *values) const {
#if OIIO_SIMD_AVX >= 512
    _mm512_mask_storeu_ps (values, mask.bitmask(), m_simd);
#else
    lo().store_mask (mask.lo(), values);
    hi().store_mask (mask.hi(), values+8);
#endif
}



/// Gather: lane i loaded from byte address baseptr + vindex[i]*scale.
template <int scale>
OIIO_FORCEINLINE void
vfloat16::gather (const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_i32gather_ps (vindex, baseptr, scale);
#else
    m_8[0].gather<scale> (baseptr, vindex.lo());
    m_8[1].gather<scale> (baseptr, vindex.hi());
#endif
}

/// Masked gather: only lanes where `mask` is set are loaded.
template<int scale>
OIIO_FORCEINLINE void
vfloat16::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_mask_i32gather_ps (m_simd, mask, vindex, baseptr, scale);
#else
    m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
    m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
#endif
}

/// Scatter: lane i stored to byte address baseptr + vindex[i]*scale.
template<int scale>
OIIO_FORCEINLINE void
vfloat16::scatter (value_t *baseptr, const vint_t& vindex) const
{
#if OIIO_SIMD_AVX >= 512
    _mm512_i32scatter_ps (baseptr, vindex, m_simd, scale);
#else
    lo().scatter<scale> (baseptr, vindex.lo());
    hi().scatter<scale> (baseptr, vindex.hi());
#endif
}

/// Masked scatter: only lanes where `mask` is set are written.
template<int scale>
OIIO_FORCEINLINE void
vfloat16::scatter_mask (const bool_t& mask, value_t *baseptr,
                        const vint_t& vindex) const
{
#if OIIO_SIMD_AVX >= 512
    _mm512_mask_i32scatter_ps (baseptr, mask, vindex, m_simd, scale);
#else
    lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
    hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
#endif
}



// Arithmetic operators for vfloat16: one AVX-512 intrinsic when available,
// otherwise recurse on the two 8-wide halves.

OIIO_FORCEINLINE vfloat16 operator+ (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_add_ps (a.m_simd, b.m_simd);
#else
    return vfloat16 (a.lo()+b.lo(), a.hi()+b.hi());
#endif
}

OIIO_FORCEINLINE const vfloat16 & operator+= (vfloat16& a, const vfloat16& b) {
    return a = a + b;
}

OIIO_FORCEINLINE vfloat16 operator- (const vfloat16& a) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_sub_ps (_mm512_setzero_ps(), a.simd());
#else
    return vfloat16 (-a.lo(), -a.hi());
#endif
}

OIIO_FORCEINLINE vfloat16 operator- (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_sub_ps (a.m_simd, b.m_simd);
#else
    return vfloat16 (a.lo()-b.lo(), a.hi()-b.hi());
#endif
}

OIIO_FORCEINLINE const vfloat16 & operator-= (vfloat16& a, const vfloat16& b) {
    return a = a - b;
}


OIIO_FORCEINLINE vfloat16 operator* (const vfloat16& a, float b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_mul_ps (a.m_simd, _mm512_set1_ps(b));
#else
    return vfloat16 (a.lo()*b, a.hi()*b);
#endif
}

OIIO_FORCEINLINE vfloat16 operator* (float a, const vfloat16& b) {
    return b * a;
}

OIIO_FORCEINLINE vfloat16 operator* (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_mul_ps (a.m_simd, b.m_simd);
#else
    return vfloat16 (a.lo()*b.lo(), a.hi()*b.hi());
#endif
}

OIIO_FORCEINLINE const vfloat16 & operator*= (vfloat16& a, const vfloat16& b) {
    return a = a * b;
}

OIIO_FORCEINLINE vfloat16 operator/ (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_div_ps (a.m_simd, b.m_simd);
#else
    return vfloat16 (a.lo()/b.lo(), a.hi()/b.hi());
#endif
}

OIIO_FORCEINLINE const vfloat16 & operator/= (vfloat16& a, const vfloat16& b) {
    return a = a / b;
}


// Comparisons: AVX-512 compares produce a vbool16 mask register directly.

OIIO_FORCEINLINE vbool16 operator== (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_EQ_OQ);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
#endif
}


OIIO_FORCEINLINE vbool16 operator!= (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_NEQ_OQ);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
#endif
}


OIIO_FORCEINLINE vbool16 operator< (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LT_OQ);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
#endif
}


OIIO_FORCEINLINE vbool16 operator> (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GT_OQ);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
#endif
}


OIIO_FORCEINLINE vbool16 operator>= (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GE_OQ);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
#endif
}


OIIO_FORCEINLINE vbool16 operator<= (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LE_OQ);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
#endif
}


// Implementation had to be after the definition of vfloat16.
// Float-to-int conversion truncates toward zero (cvtt).
OIIO_FORCEINLINE vint16::vint16 (const vfloat16& f)
{
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvttps_epi32(f);
#else
    *this = vint16 (vint8(f.lo()), vint8(f.hi()));
#endif
}



// Shuffle groups of 4
template<int i0, int i1, int i2, int i3>
vfloat16 shuffle4 (const vfloat16& a) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_shuffle_f32x4(a,a,_MM_SHUFFLE(i3,i2,i1,i0));
#else
    vfloat4 x[4];
    a.store ((float *)x);
    return vfloat16 (x[i0], x[i1], x[i2], x[i3]);
#endif
}

/// Broadcast 4-float group i to all four group slots.
template<int i> vfloat16 shuffle4 (const vfloat16& a) {
    return shuffle4<i,i,i,i> (a);
}

/// Shuffle within each group of 4 lanes (same pattern for all groups).
template<int i0, int i1, int i2, int i3>
vfloat16 shuffle (const vfloat16& a) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_permute_ps(a,_MM_SHUFFLE(i3,i2,i1,i0));
#else
    vfloat4 x[4];
    a.store ((float *)x);
    return vfloat16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
                     shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
#endif
}

/// Broadcast lane i of each 4-lane group within that group.
template<int i> vfloat16 shuffle (const vfloat16& a) {
    return shuffle<i,i,i,i> (a);
}
/// Return the single float in lane i.
template<int i>
OIIO_FORCEINLINE float extract (const vfloat16& a) {
    return a[i];
}


/// Return a copy of a with lane i replaced by val.
template<int i>
OIIO_FORCEINLINE vfloat16 insert (const vfloat16& a, float val) {
    vfloat16 tmp = a;
    tmp[i] = val;
    return tmp;
}


OIIO_FORCEINLINE float vfloat16::x () const {
#if OIIO_SIMD_AVX >= 512
    return _mm_cvtss_f32(_mm512_castps512_ps128(m_simd));
#else
    return m_val[0];
#endif
}

// Named accessors for the first four lanes (Imath-style x/y/z/w).
OIIO_FORCEINLINE float vfloat16::y () const { return m_val[1]; }
OIIO_FORCEINLINE float vfloat16::z () const { return m_val[2]; }
OIIO_FORCEINLINE float vfloat16::w () const { return m_val[3]; }
OIIO_FORCEINLINE void vfloat16::set_x (float val) { m_val[0] = val; }
OIIO_FORCEINLINE void vfloat16::set_y (float val) { m_val[1] = val; }
OIIO_FORCEINLINE void vfloat16::set_z (float val) { m_val[2] = val; }
OIIO_FORCEINLINE void vfloat16::set_w (float val) { m_val[3] = val; }


/// Reinterpret the bits of the 16 floats as 16 ints (no conversion).
OIIO_FORCEINLINE vint16 bitcast_to_int (const vfloat16& x)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_castps_si512 (x.simd());
#else
    // NOTE(review): type-punning via pointer cast; technically violates
    // strict aliasing (a memcpy would be the conforming form).
    return *(vint16 *)&x;
#endif
}

/// Reinterpret the bits of the 16 ints as 16 floats (no conversion).
OIIO_FORCEINLINE vfloat16 bitcast_to_float (const vint16& x)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_castsi512_ps (x.simd());
#else
    return *(vfloat16 *)&x;
#endif
}


/// Horizontal sum of all 16 lanes, replicated into every lane.
OIIO_FORCEINLINE vfloat16 vreduce_add (const vfloat16& v) {
#if OIIO_SIMD_AVX >= 512
    // Nomenclature: ABCD are the vint4's comprising v
    // First, add the vint4's and make them all the same
    vfloat16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v);   // each adjacent vint4 is summed
    vfloat16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
    // Now, add within each vint4
    vfloat16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w);  // each adjacent int is summed
    return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
#else
    vfloat8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
    return vfloat16 (sum, sum);
#endif
}


/// Horizontal sum of all 16 lanes, returned as a scalar float.
OIIO_FORCEINLINE float reduce_add (const vfloat16& v) {
#if OIIO_SIMD_AVX >= 512
    return vreduce_add(v).x();
#else
    return reduce_add(v.lo()) + reduce_add(v.hi());
#endif
}


/// Per-lane select: mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool16& mask)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_mask_blend_ps (mask, a, b);
#else
    return vfloat16 (blend (a.lo(), b.lo(), mask.lo()),
                     blend (a.hi(), b.hi(), mask.hi()));
#endif
}


/// Per-lane: mask[i] ? a[i] : 0.0f.
OIIO_FORCEINLINE vfloat16 blend0 (const vfloat16& a, const vbool16& mask)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_maskz_mov_ps (mask, a);
#else
    return vfloat16 (blend0 (a.lo(), mask.lo()),
                     blend0 (a.hi(), mask.hi()));
#endif
}


/// Per-lane: mask[i] ? 0.0f : a[i].
OIIO_FORCEINLINE vfloat16 blend0not (const vfloat16& a, const vbool16& mask)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_maskz_mov_ps (!mask, a);
#else
    return vfloat16 (blend0not (a.lo(), mask.lo()),
                     blend0not (a.hi(), mask.hi()));
#endif
}


/// Per-lane select with the mask first (ternary-style argument order).
OIIO_FORCEINLINE vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b)
{
    return blend (b, a, mask);
}


/// Per-lane a/b, but 0 (instead of inf/NaN) wherever b is 0.
OIIO_FORCEINLINE vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b) {
#if OIIO_SIMD_SSE
    return blend0not (a/b, b == vfloat16::Zero());
#else
    SIMD_RETURN (vfloat16, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
#endif
}


OIIO_FORCEINLINE vfloat16 abs (const vfloat16& a)
{
#if OIIO_SIMD_AVX >= 512
    // Not available?  return _mm512_abs_ps (a.simd());
    // Just clear the sign bit for cheap fabsf
    return _mm512_castsi512_ps (_mm512_and_epi32 (_mm512_castps_si512(a.simd()),
                                                  _mm512_set1_epi32(0x7fffffff)));
#else
    return vfloat16(abs(a.lo()), abs(a.hi()));
#endif
}


/// Per-lane sign: -1.0 where a < 0, else +1.0 (0 and NaN map to +1).
OIIO_FORCEINLINE vfloat16 sign (const vfloat16& a)
{
    vfloat16 one(1.0f);
    return blend (one, -one, a < vfloat16::Zero());
}


OIIO_FORCEINLINE vfloat16 ceil (const vfloat16& a)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_ceil_ps (a);
#else
    return vfloat16(ceil(a.lo()), ceil(a.hi()));
#endif
}

OIIO_FORCEINLINE vfloat16 floor (const vfloat16& a)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_floor_ps (a);
#else
    return vfloat16(floor(a.lo()), floor(a.hi()));
#endif
}


OIIO_FORCEINLINE vfloat16 round (const vfloat16& a)
{
#if OIIO_SIMD_AVX >= 512
    // NOTE(review): per the Intel intrinsics guide, imm (1<<4)|3 selects
    // rounding mode 3 (toward zero) with M=1 fraction bits preserved,
    // whereas the 8-wide round() uses _MM_FROUND_TO_NEAREST_INT — verify
    // the immediate matches the intended round-to-nearest semantics.
    return _mm512_roundscale_ps (a, (1<<4) | 3); // scale=1, round to nearest smaller mag int
#else
    return vfloat16(round(a.lo()), round(a.hi()));
#endif
}

/// Per-lane (int)floor(a[i]).
OIIO_FORCEINLINE vint16 ifloor (const vfloat16& a)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_cvt_roundps_epi32 (a, (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC));
#else
    return vint16(floor(a));
#endif
}


/// Per-lane round-to-nearest, as integers.
OIIO_FORCEINLINE vint16 rint (const vfloat16& a)
{
    return vint16(round(a));
}


/// Fast approximate reciprocal; the rcp14 path refines the estimate with
/// one Newton-Raphson step.
OIIO_FORCEINLINE vfloat16 rcp_fast (const vfloat16 &a)
{
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
    return _mm512_rcp28_ps(a);
#elif OIIO_SIMD_AVX >= 512
    vfloat16 r = _mm512_rcp14_ps(a);
    return r * nmadd (r, a, vfloat16(2.0f));
#else
    return vfloat16(rcp_fast(a.lo()), rcp_fast(a.hi()));
#endif
}


OIIO_FORCEINLINE vfloat16 sqrt (const vfloat16 &a)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_sqrt_ps (a);
#else
    return vfloat16(sqrt(a.lo()), sqrt(a.hi()));
#endif
}


/// Full-precision 1/sqrt(a).
OIIO_FORCEINLINE vfloat16 rsqrt (const vfloat16 &a)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_div_ps (_mm512_set1_ps(1.0f), _mm512_sqrt_ps (a));
#else
    return vfloat16(rsqrt(a.lo()), rsqrt(a.hi()));
#endif
}


/// Fast approximate 1/sqrt(a) (reduced precision hardware estimate).
OIIO_FORCEINLINE vfloat16 rsqrt_fast (const vfloat16 &a)
{
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
    return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
#elif OIIO_SIMD_AVX >= 512
    return _mm512_rsqrt14_ps (a);
#else
    return vfloat16(rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
#endif
}


OIIO_FORCEINLINE vfloat16 min (const vfloat16& a, const vfloat16& b)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_min_ps (a, b);
#else
    return vfloat16(min(a.lo(),b.lo()), min(a.hi(),b.hi()));
#endif
}

OIIO_FORCEINLINE vfloat16 max (const vfloat16& a, const vfloat16& b)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_max_ps (a, b);
#else
    return vfloat16(max(a.lo(),b.lo()), max(a.hi(),b.hi()));
#endif
}


/// Bitwise (~a) & b on the float bit patterns.
OIIO_FORCEINLINE vfloat16 andnot (const vfloat16& a, const vfloat16& b) {
#if OIIO_SIMD_AVX >= 512 && defined(__AVX512DQ__)
    return _mm512_andnot_ps (a, b);
#else
    return vfloat16(andnot(a.lo(),b.lo()), andnot(a.hi(),b.hi()));
#endif
}


/// Fused multiply-add: a*b + c.
OIIO_FORCEINLINE vfloat16 madd (const simd::vfloat16& a, const simd::vfloat16& b,
                                const simd::vfloat16& c)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_fmadd_ps (a, b, c);
#else
    return vfloat16 (madd(a.lo(), b.lo(), c.lo()),
                     madd(a.hi(), b.hi(), c.hi()));
#endif
}


/// Fused multiply-subtract: a*b - c.
OIIO_FORCEINLINE vfloat16 msub (const simd::vfloat16& a, const simd::vfloat16& b,
                                const simd::vfloat16& c)
{
#if OIIO_SIMD_AVX >= 512
10052 return _mm512_fmsub_ps (a, b, c); 10053 #else 10054 return vfloat16 (msub(a.lo(), b.lo(), c.lo()), 10055 msub(a.hi(), b.hi(), c.hi())); 10056 #endif 10057 } 10058 10059 10060 10061 OIIO_FORCEINLINE vfloat16 nmadd (const simd::vfloat16& a, const simd::vfloat16& b, 10062 const simd::vfloat16& c) 10063 { 10064 #if OIIO_SIMD_AVX >= 512 10065 return _mm512_fnmadd_ps (a, b, c); 10066 #else 10067 return vfloat16 (nmadd(a.lo(), b.lo(), c.lo()), 10068 nmadd(a.hi(), b.hi(), c.hi())); 10069 #endif 10070 } 10071 10072 10073 10074 OIIO_FORCEINLINE vfloat16 nmsub (const simd::vfloat16& a, const simd::vfloat16& b, 10075 const simd::vfloat16& c) 10076 { 10077 #if OIIO_SIMD_AVX >= 512 10078 return _mm512_fnmsub_ps (a, b, c); 10079 #else 10080 return vfloat16 (nmsub(a.lo(), b.lo(), c.lo()), 10081 nmsub(a.hi(), b.hi(), c.hi())); 10082 #endif 10083 } 10084 10085 10086 10087 10088 } // end namespace simd 10089 10090 OIIO_NAMESPACE_END 10091 10092 10093 #undef SIMD_DO 10094 #undef SIMD_CONSTRUCT 10095 #undef SIMD_CONSTRUCT_PAD 10096 #undef SIMD_RETURN 10097 #undef SIMD_RETURN_REDUCE 10098