1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md
4 
5 /// @file  simd.h
6 ///
7 /// @brief Classes for SIMD processing.
8 ///
9 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.):
10 ///   https://software.intel.com/sites/landingpage/IntrinsicsGuide/
11 ///
12 /// Similar guide for ARM intrinsics:
13 ///   https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
14 ///
15 /// It helped me a lot to peruse the source of these packages:
16 ///   Syrah:     https://github.com/boulos/syrah
17 ///   Embree:    https://github.com/embree
18 ///   Vectorial: https://github.com/scoopr/vectorial
19 ///
20 /// To find out which CPU features you have:
21 ///   Linux: cat /proc/cpuinfo
22 ///   OSX:   sysctl machdep.cpu.features
23 ///
24 /// Additional web resources:
25 ///   http://www.codersnotes.com/notes/maths-lib-2016/
26 
27 // clang-format off
28 
29 #pragma once
30 
31 #include <algorithm>
32 #include <cstring>
33 
34 #include <OpenImageIO/Imath.h>
35 #include <OpenImageIO/dassert.h>
36 #include <OpenImageIO/platform.h>
37 
38 
39 //////////////////////////////////////////////////////////////////////////
40 // Sort out which SIMD capabilities we have and set definitions
41 // appropriately. This is mostly for internal (within this file) use,
42 // but client applications using this header may find a few of the macros
43 // we define to be useful:
44 //
45 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD
46 //             hardware is available, this will hold the width in number of
47 //             float SIMD "lanes" of widest SIMD registers available. For
48 //             example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are
49 //             hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated,
50 //             etc. Using SIMD classes wider than this should work (will be
51 //             emulated with narrower SIMD or scalar operations), but is not
52 //             expected to have high performance.
53 // OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
54 //             specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
55 //             higher (including AVX).
56 // OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
57 //             specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
58 // OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
59 // OIIO_SIMD_MAX_SIZE : holds the width in bytes of the widest SIMD
60 //             available (generally will be OIIO_SIMD*4).
61 // OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
62 // OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
63 // OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
64 // OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
65 // OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
66 // OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined
67 
68 #if defined(_WIN32)
69 #  include <intrin.h>
70 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
71 #  include <x86intrin.h>
72 #elif defined(__GNUC__) && defined(__ARM_NEON__)
73 #  include <arm_neon.h>
74 #endif
75 
76 // Disable SSE for 32 bit Windows patforms, it's unreliable and hard for us
77 // to test thoroughly. We presume that anybody needing high performance
78 // badly enough to want SIMD also is on a 64 bit CPU.
79 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
80 #define OIIO_NO_SSE 1
81 #endif
82 
83 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
84 #  if (defined(__SSE4_1__) || defined(__SSE4_2__))
85 #    define OIIO_SIMD_SSE 4
86       /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
87        * instructions specific to 4.2, but they are all related to string
88        * comparisons and CRCs, which don't currently seem relevant to OIIO,
89        * so for simplicity, we sweep this difference under the rug.
90        */
91 #  elif defined(__SSSE3__)
92 #    define OIIO_SIMD_SSE 3
93      /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
94       * there are a few older architectures that are SSE3 but not SSSE3,
95       * and this simplification means that these particular old platforms
96       * will only get SSE2 goodness out of our code. So be it. Anybody who
97       * cares about performance is probably using a 64 bit machine that's
98       * SSE 4.x or AVX by now.
99       */
100 #  else
101 #    define OIIO_SIMD_SSE 2
102 #  endif
103 #  define OIIO_SIMD 4
104 #  define OIIO_SIMD_MAX_SIZE_BYTES 16
105 #  define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
106 #  define OIIO_SSE_ALIGN OIIO_ALIGN(16)
107 #else
108 #  define OIIO_SIMD_SSE 0
109 #endif
110 
111 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
112    // N.B. Any machine with AVX will also have SSE
113 #  if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
114 #    define OIIO_SIMD_AVX 2
115 #  else
116 #    define OIIO_SIMD_AVX 1
117 #  endif
118 #  undef OIIO_SIMD
119 #  define OIIO_SIMD 8
120 #  undef OIIO_SIMD_MAX_SIZE_BYTES
121 #  define OIIO_SIMD_MAX_SIZE_BYTES 32
122 #  define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
123 #  define OIIO_AVX_ALIGN OIIO_ALIGN(32)
124 #  if defined(__AVX512F__)
125 #    undef OIIO_SIMD_AVX
126 #    define OIIO_SIMD_AVX 512
127 #    undef OIIO_SIMD_MAX_SIZE_BYTES
128 #    define OIIO_SIMD_MAX_SIZE_BYTES 64
129 #    undef OIIO_SIMD
130 #    define OIIO_SIMD 16
131 #    define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
132 #    define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
133 #    define OIIO_AVX512F_ENABLED 1
134 #  endif
135 #  if defined(__AVX512DQ__)
136 #    define OIIO_AVX512DQ_ENABLED 1   /* Doubleword and quadword */
137 #  else
138 #    define OIIO_AVX512DQ_ENABLED 0
139 #  endif
140 #  if defined(__AVX512PF__)
141 #    define OIIO_AVX512PF_ENABLED 1   /* Prefetch */
142 #  else
143 #    define OIIO_AVX512PF_ENABLED 0
144 #  endif
145 #  if defined(__AVX512ER__)
146 #    define OIIO_AVX512ER_ENABLED 1   /* Exponential & reciprocal */
147 #  else
148 #    define OIIO_AVX512ER_ENABLED 0
149 #  endif
150 #  if defined(__AVX512CD__)
151 #    define OIIO_AVX512CD_ENABLED 1   /* Conflict detection */
152 #  else
153 #    define OIIO_AVX512CD_ENABLED 0
154 #  endif
155 #  if defined(__AVX512BW__)
156 #    define OIIO_AVX512BW_ENABLED 1   /* Byte and word */
157 #  else
158 #    define OIIO_AVX512BW_ENABLED 0
159 #  endif
160 #  if defined(__AVX512VL__)
161 #    define OIIO_AVX512VL_ENABLED 1   /* Vector length extensions */
162 #  else
163 #    define OIIO_AVX512VL_ENABLED 0
164 #  endif
165 #else
166 #  define OIIO_SIMD_AVX 0
167 #  define OIIO_AVX512VL_ENABLED 0
168 #  define OIIO_AVX512DQ_ENABLED 0
169 #  define OIIO_AVX512PF_ENABLED 0
170 #  define OIIO_AVX512ER_ENABLED 0
171 #  define OIIO_AVX512CD_ENABLED 0
172 #  define OIIO_AVX512BW_ENABLED 0
173 #endif
174 
175 #if defined(__FMA__)
176 #  define OIIO_FMA_ENABLED 1
177 #else
178 #  define OIIO_FMA_ENABLED 0
179 #endif
180 #if defined(__AVX512IFMA__)
181 #  define OIIO_AVX512IFMA_ENABLED 1
182 #else
183 #  define OIIO_AVX512IFMA_ENABLED 0
184 #endif
185 
186 #if defined(__F16C__)
187 #  define OIIO_F16C_ENABLED 1
188 #else
189 #  define OIIO_F16C_ENABLED 0
190 #endif
191 
192 // FIXME Future: support ARM Neon
193 // Uncomment this when somebody with Neon can verify it works
194 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
195 #  define OIIO_SIMD 4
196 #  define OIIO_SIMD_NEON 1
197 #  define OIIO_SIMD_MAX_SIZE_BYTES 16
198 #  define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
199 #  define OIIO_SSE_ALIGN OIIO_ALIGN(16)
200 #else
201 #  define OIIO_SIMD_NEON 0
202 #endif
203 
204 #ifndef OIIO_SIMD
205    // No SIMD available
206 #  define OIIO_SIMD 0
207 #  define OIIO_SIMD4_ALIGN
208 #  define OIIO_SIMD_MAX_SIZE_BYTES 16
209 #endif
210 
211 #ifndef OIIO_SIMD8_ALIGN
212 #  define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
213 #endif
214 #ifndef OIIO_SIMD16_ALIGN
215 #  define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
216 #endif
217 
218 
219 // General features that client apps may want to test for, for conditional
220 // compilation. Will add to this over time as needed. Note that just
221 // because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
222 // the vfloat8 class (and friends) are in this version of simd.h, but that's
223 // different from OIIO_SIMD >= 8, which means it's supported in hardware.
224 #define OIIO_SIMD_HAS_MATRIX4 1  /* matrix44 defined */
225 #define OIIO_SIMD_HAS_FLOAT8 1   /* DEPRECATED(1.8) */
226 #define OIIO_SIMD_HAS_SIMD8 1    /* vfloat8, vint8, vbool8 defined */
227 #define OIIO_SIMD_HAS_SIMD16 1   /* vfloat16, vint16, vbool16 defined */
228 
229 
230 // Embarrassing hack: Xlib.h #define's True and False!
231 #ifdef True
232 #    undef True
233 #endif
234 #ifdef False
235 #    undef False
236 #endif
237 
238 
239 
240 OIIO_NAMESPACE_BEGIN
241 
242 namespace simd {
243 
244 //////////////////////////////////////////////////////////////////////////
245 // Forward declarations of our main SIMD classes
246 
247 class vbool4;
248 class vint4;
249 class vfloat4;
250 class vfloat3;
251 class matrix44;
252 class vbool8;
253 class vint8;
254 class vfloat8;
255 class vbool16;
256 class vint16;
257 class vfloat16;
258 
259 // Deprecated names -- remove these in 1.9
260 typedef vbool4 mask4;    // old name
261 typedef vbool4 bool4;
262 typedef vbool8 bool8;
263 typedef vint4 int4;
264 typedef vint8 int8;
265 typedef vfloat3 float3;
266 typedef vfloat4 float4;
267 typedef vfloat8 float8;
268 
269 
270 
271 //////////////////////////////////////////////////////////////////////////
272 // Template magic to determine the raw SIMD types involved, and other
273 // things helpful for metaprogramming.
274 
275 template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
276 template <int N> struct simd_bool_t { struct type { int val[N]; }; };
277 
278 #if OIIO_SIMD_SSE
279 template<> struct simd_raw_t<int,4> { typedef __m128i type; };
280 template<> struct simd_raw_t<float,4> { typedef __m128 type; };
281 template<> struct simd_bool_t<4> { typedef __m128 type; };
282 #endif
283 
284 #if OIIO_SIMD_AVX
285 template<> struct simd_raw_t<int,8> { typedef __m256i type; };
286 template<> struct simd_raw_t<float,8> { typedef __m256 type; };
287 template<> struct simd_bool_t<8> { typedef __m256 type; };
288 #endif
289 
290 #if OIIO_SIMD_AVX >= 512
291 template<> struct simd_raw_t<int,16> { typedef __m512i type; };
292 template<> struct simd_raw_t<float,16> { typedef __m512 type; };
293 template<> struct simd_bool_t<16> { typedef __mmask16 type; };
294 #else
295 // Note: change in strategy for 16-wide SIMD: instead of int[16] for
296 // vbool16, it's just a plain old bitmask, and __mask16 for actual HW.
297 template<> struct simd_bool_t<16> { typedef uint16_t type; };
298 #endif
299 
300 #if OIIO_SIMD_NEON
301 template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
302 template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
303 template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
304 #endif
305 
306 
307 /// Template to retrieve the vector type from the scalar. For example,
308 /// simd::VecType<int,4> will be vfloat4.
309 template<typename T,int elements> struct VecType {};
310 template<> struct VecType<int,1>   { typedef int type; };
311 template<> struct VecType<float,1> { typedef float type; };
312 template<> struct VecType<int,4>   { typedef vint4 type; };
313 template<> struct VecType<float,4>   { typedef vfloat4 type; };
314 template<> struct VecType<float,3> { typedef vfloat3 type; };
315 template<> struct VecType<bool,4>  { typedef vbool4 type; };
316 template<> struct VecType<int,8>   { typedef vint8 type; };
317 template<> struct VecType<float,8> { typedef vfloat8 type; };
318 template<> struct VecType<bool,8>  { typedef vbool8 type; };
319 template<> struct VecType<int,16>   { typedef vint16 type; };
320 template<> struct VecType<float,16> { typedef vfloat16 type; };
321 template<> struct VecType<bool,16>  { typedef vbool16 type; };
322 
323 /// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
324 /// anything but our SIMD types.
325 template<typename T> struct SimdSize { static const int size = 1; };
326 template<> struct SimdSize<vint4>     { static const int size = 4; };
327 template<> struct SimdSize<vfloat4>   { static const int size = 4; };
328 template<> struct SimdSize<vfloat3>   { static const int size = 4; };
329 template<> struct SimdSize<vbool4>    { static const int size = 4; };
330 template<> struct SimdSize<vint8>     { static const int size = 8; };
331 template<> struct SimdSize<vfloat8>   { static const int size = 8; };
332 template<> struct SimdSize<vbool8>    { static const int size = 8; };
333 template<> struct SimdSize<vint16>    { static const int size = 16; };
334 template<> struct SimdSize<vfloat16>  { static const int size = 16; };
335 template<> struct SimdSize<vbool16>   { static const int size = 16; };
336 
337 /// Template to retrieve the number of elements size of a SIMD type. Rigged
338 /// to be 1 for anything but our SIMD types.
339 template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
340 template<> struct SimdElements<vfloat3>   { static const int size = 3; };
341 
342 /// Template giving a printable name for each type
343 template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
344 template<> struct SimdTypeName<vfloat4>   { static const char *name() { return "vfloat4"; } };
345 template<> struct SimdTypeName<vint4>     { static const char *name() { return "vint4"; } };
346 template<> struct SimdTypeName<vbool4>    { static const char *name() { return "vbool4"; } };
347 template<> struct SimdTypeName<vfloat8>   { static const char *name() { return "vfloat8"; } };
348 template<> struct SimdTypeName<vint8>     { static const char *name() { return "vint8"; } };
349 template<> struct SimdTypeName<vbool8>    { static const char *name() { return "vbool8"; } };
350 template<> struct SimdTypeName<vfloat16>  { static const char *name() { return "vfloat16"; } };
351 template<> struct SimdTypeName<vint16>    { static const char *name() { return "vint16"; } };
352 template<> struct SimdTypeName<vbool16>   { static const char *name() { return "vbool16"; } };
353 
354 
355 //////////////////////////////////////////////////////////////////////////
356 // Macros helpful for making static constants in code.
357 
358 # define OIIO_SIMD_FLOAT4_CONST(name,val) \
359     static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
360 # define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
361     static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
362 # define OIIO_SIMD_INT4_CONST(name,val) \
363     static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
364 # define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
365     static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
366 # define OIIO_SIMD_UINT4_CONST(name,val) \
367     static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
368 # define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
369     static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
370 
371 # define OIIO_SIMD_FLOAT8_CONST(name,val) \
372     static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
373                                                     (val), (val), (val), (val) }
374 # define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
375     static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
376                                                     (v4), (v5), (v6), (v7) }
377 # define OIIO_SIMD_INT8_CONST(name,val) \
378     static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
379                                                   (val), (val), (val), (val) }
380 # define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
381     static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
382                                                   (v4), (v5), (v6), (v7) }
383 # define OIIO_SIMD_UINT8_CONST(name,val) \
384     static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
385                                                        (val), (val), (val), (val) }
386 # define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
387     static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
388                                                        (v4), (v5), (v6), (v7) }
389 
390 # define OIIO_SIMD_VFLOAT16_CONST(name,val) \
391     static const OIIO_SIMD16_ALIGN float name[16] = {           \
392         (val), (val), (val), (val), (val), (val), (val), (val), \
393         (val), (val), (val), (val), (val), (val), (val), (val) }
394 # define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
395     static const OIIO_SIMD16_ALIGN float name[16] = {           \
396         (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7),         \
397         (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
398 # define OIIO_SIMD_INT16_CONST(name,val) \
399     static const OIIO_SIMD16_ALIGN int name[16] = {             \
400         (val), (val), (val), (val), (val), (val), (val), (val), \
401         (val), (val), (val), (val), (val), (val), (val), (val) }
402 # define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
403     static const OIIO_SIMD16_ALIGN int name[16] = { \
404         (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
405         (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
406 # define OIIO_SIMD_UINT16_CONST(name,val) \
407     static const OIIO_SIMD16_ALIGN uint32_t name[16] = {        \
408         (val), (val), (val), (val), (val), (val), (val), (val), \
409         (val), (val), (val), (val), (val), (val), (val), (val) }
410 # define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
411     static const OIIO_SIMD16_ALIGN uint32_t name[16] = {        \
412         (val), (val), (val), (val), (val), (val), (val), (val), \
413         (val), (val), (val), (val), (val), (val), (val), (val) }
414 
415 
416 //////////////////////////////////////////////////////////////////////////
417 // Some macros just for use in this file (#undef-ed at the end) making
418 // it more succinct to express per-element operations.
419 
420 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
421 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
422 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
423                               for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
424 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
425 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
426 
427 
428 
429 //////////////////////////////////////////////////////////////////////////
430 //////////////////////////////////////////////////////////////////////////
431 // The public declarations of the main SIMD classes follow: boolN, intN,
432 // floatN, matrix44.
433 //
434 // These class declarations are intended to be brief and self-documenting,
435 // and give all the information that users or client applications need to
436 // know to use these classes.
437 //
438 // No implementations are given inline except for the briefest, completely
439 // generic methods that don't have any architecture-specific overloads.
440 // After the class defintions, there will be an immense pile of full
441 // implementation definitions, which casual users are not expected to
442 // understand.
443 //////////////////////////////////////////////////////////////////////////
444 //////////////////////////////////////////////////////////////////////////
445 
446 
447 /// vbool4: An 4-vector whose elements act mostly like bools, accelerated by
448 /// SIMD instructions when available. This is what is naturally produced by
449 /// SIMD comparison operators on the vfloat4 and vint4 types.
450 class vbool4 {
451 public:
452     static const char* type_name() { return "vbool4"; }
453     typedef bool value_t;        ///< Underlying equivalent scalar value type
454     enum { elements = 4 };       ///< Number of scalar elements
455     enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
456     enum { bits = elements*32 }; ///< Total number of bits
457     typedef simd_bool_t<4>::type simd_t;  ///< the native SIMD type used
458 
459     /// Default constructor (contents undefined)
460     vbool4 () { }
461 
462     /// Construct from a single value (store it in all slots)
463     vbool4 (bool a) { load(a); }
464 
465     explicit vbool4 (const bool *a);
466 
467     /// Construct from 4 bool values
468     vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
469 
470     /// Copy construct from another vbool4
471     vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
472 
473     /// Construct from 4 int values
474     vbool4 (int a, int b, int c, int d) {
475         load (bool(a), bool(b), bool(c), bool(d));
476     }
477 
478     /// Construct from a SIMD int (is each element nonzero?)
479     vbool4 (const vint4 &i);
480 
481     /// Construct from the underlying SIMD type
482     vbool4 (const simd_t& m) : m_simd(m) { }
483 
484     /// Return the raw SIMD type
485     operator simd_t () const { return m_simd; }
486     simd_t simd () const { return m_simd; }
487     simd_t& simd () { return m_simd; }
488 
489     /// Extract the bitmask
490     int bitmask () const;
491 
492     /// Convert from integer bitmask to a true vbool4
493     static vbool4 from_bitmask (int bitmask);
494 
495     /// Set all components to false
496     void clear ();
497 
498     /// Return a vbool4 the is 'false' for all values
499     static const vbool4 False ();
500 
501     /// Return a vbool4 the is 'true' for all values
502     static const vbool4 True ();
503 
504     /// Assign one value to all components
505     const vbool4 & operator= (bool a) { load(a); return *this; }
506 
507     /// Assignment of another vbool4
508     const vbool4 & operator= (const vbool4 & other);
509 
510     /// Component access (get)
511     int operator[] (int i) const;
512 
513     /// Component access (set).
514     void setcomp (int i, bool value);
515 
516     /// Component access (set).
517     /// NOTE: avoid this unsafe construct. It will go away some day.
518     int& operator[] (int i);
519 
520     /// Helper: load a single value into all components.
521     void load (bool a);
522 
523     /// Helper: load separate values into each component.
524     void load (bool a, bool b, bool c, bool d);
525 
526     /// Helper: store the values into memory as bools.
527     void store (bool *values) const;
528 
529     /// Store the first n values into memory.
530     void store (bool *values, int n) const;
531 
532     /// Logical/bitwise operators, component-by-component
533     friend vbool4 operator! (const vbool4& a);
534     friend vbool4 operator& (const vbool4& a, const vbool4& b);
535     friend vbool4 operator| (const vbool4& a, const vbool4& b);
536     friend vbool4 operator^ (const vbool4& a, const vbool4& b);
537     friend vbool4 operator~ (const vbool4& a);
538     friend const vbool4& operator&= (vbool4& a, const vbool4& b);
539     friend const vbool4& operator|= (vbool4& a, const vbool4& b);
540     friend const vbool4& operator^= (vbool4& a, const vbool4& b);
541 
542     /// Comparison operators, component by component
543     friend vbool4 operator== (const vbool4& a, const vbool4& b);
544     friend vbool4 operator!= (const vbool4& a, const vbool4& b);
545 
546     /// Stream output
547     friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);
548 
549 private:
550     // The actual data representation
551     union {
552         simd_t m_simd;
553         int m_val[paddedelements];
554     };
555 };
556 
557 
558 
559 /// Helper: shuffle/swizzle with constant (templated) indices.
560 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
561 template<int i0, int i1, int i2, int i3> vbool4 shuffle (const vbool4& a);
562 
563 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
564 template<int i> vbool4 shuffle (const vbool4& a);
565 
566 /// Helper: as rapid as possible extraction of one component, when the
567 /// index is fixed.
568 template<int i> bool extract (const vbool4& a);
569 
570 /// Helper: substitute val for a[i]
571 template<int i> vbool4 insert (const vbool4& a, bool val);
572 
573 /// Logical reduction across all components.
574 bool reduce_and (const vbool4& v);
575 bool reduce_or (const vbool4& v);
576 
577 // Are all/any/no components true?
578 bool all (const vbool4& v);
579 bool any (const vbool4& v);
580 bool none (const vbool4& v);
581 
582 // It's handy to have this defined for regular bool as well
583 inline bool all (bool v) { return v; }
584 
585 
586 
587 /// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
588 /// SIMD instructions when available. This is what is naturally produced by
589 /// SIMD comparison operators on the vfloat8 and vint8 types.
590 class vbool8 {
591 public:
592     static const char* type_name() { return "vbool8"; }
593     typedef bool value_t;        ///< Underlying equivalent scalar value type
594     enum { elements = 8 };       ///< Number of scalar elements
595     enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
596     enum { bits = elements*32 }; ///< Total number of bits
597     typedef simd_bool_t<8>::type simd_t;  ///< the native SIMD type used
598 
599     /// Default constructor (contents undefined)
600     vbool8 () { }
601 
602     /// Construct from a single value (store it in all slots)
603     vbool8 (bool a) { load (a); }
604 
605     explicit vbool8 (const bool *values);
606 
607     /// Construct from 8 bool values
608     vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
609 
610     /// Copy construct from another vbool8
611     vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
612 
613     /// Construct from 8 int values
614     vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);
615 
616     /// Construct from a SIMD int (is each element nonzero?)
617     vbool8 (const vint8 &i);
618 
619     /// Construct from two vbool4's
620     vbool8 (const vbool4 &lo, const vbool4 &hi);
621 
622     /// Construct from the underlying SIMD type
623     vbool8 (const simd_t& m) : m_simd(m) { }
624 
625     /// Return the raw SIMD type
626     operator simd_t () const { return m_simd; }
627     simd_t simd () const { return m_simd; }
628     simd_t& simd () { return m_simd; }
629 
630     /// Extract the bitmask
631     int bitmask () const;
632 
633     /// Convert from integer bitmask to a true vbool8
634     static vbool8 from_bitmask (int bitmask);
635 
636     /// Set all components to false
637     void clear ();
638 
639     /// Return a vbool8 the is 'false' for all values
640     static const vbool8 False ();
641 
642     /// Return a vbool8 the is 'true' for all values
643     static const vbool8 True ();
644 
645     /// Assign one value to all components
646     const vbool8 & operator= (bool a);
647 
648     /// Assignment of another vbool8
649     const vbool8 & operator= (const vbool8 & other);
650 
651     /// Component access (get)
652     int operator[] (int i) const;
653 
654     /// Component access (set).
655     void setcomp (int i, bool value);
656 
657     /// Component access (set).
658     /// NOTE: avoid this unsafe construct. It will go away some day.
659     int& operator[] (int i);
660 
661     /// Extract the lower precision vbool4
662     vbool4 lo () const;
663 
664     /// Extract the higher precision vbool4
665     vbool4 hi () const;
666 
667     /// Helper: load a single value into all components.
668     void load (bool a);
669 
670     /// Helper: load separate values into each component.
671     void load (bool a, bool b, bool c, bool d,
672                bool e, bool f, bool g, bool h);
673 
674     /// Helper: store the values into memory as bools.
675     void store (bool *values) const;
676 
677     /// Store the first n values into memory.
678     void store (bool *values, int n) const;
679 
680     /// Logical/bitwise operators, component-by-component
681     friend vbool8 operator! (const vbool8& a);
682     friend vbool8 operator& (const vbool8& a, const vbool8& b);
683     friend vbool8 operator| (const vbool8& a, const vbool8& b);
684     friend vbool8 operator^ (const vbool8& a, const vbool8& b);
685     friend vbool8 operator~ (const vbool8& a);
686     friend const vbool8& operator&= (vbool8& a, const vbool8& b);
687     friend const vbool8& operator|= (vbool8& a, const vbool8& b);
688     friend const vbool8& operator^= (vbool8& a, const vbool8& b);
689 
690     /// Comparison operators, component by component
691     friend vbool8 operator== (const vbool8& a, const vbool8& b);
692     friend vbool8 operator!= (const vbool8& a, const vbool8& b);
693 
694     /// Stream output
695     friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);
696 
697 private:
698     // The actual data representation
699     union {
700         simd_t m_simd;
701         int m_val[paddedelements];
702         vbool4 m_4[2];
703     };
704 };
705 
706 
707 
708 /// Helper: shuffle/swizzle with constant (templated) indices.
709 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
710 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
711 vbool8 shuffle (const vbool8& a);
712 
713 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
714 template<int i> vbool8 shuffle (const vbool8& a);
715 
716 /// Helper: as rapid as possible extraction of one component, when the
717 /// index is fixed.
718 template<int i> bool extract (const vbool8& a);
719 
720 /// Helper: substitute val for a[i]
721 template<int i> vbool8 insert (const vbool8& a, bool val);
722 
723 /// Logical reduction across all components.
724 bool reduce_and (const vbool8& v);
725 bool reduce_or (const vbool8& v);
726 
727 // Are all/any/no components true?
728 bool all (const vbool8& v);
729 bool any (const vbool8& v);
730 bool none (const vbool8& v);
731 
732 
733 
734 
735 /// vbool16: An 16-vector whose elements act mostly like bools, accelerated
736 /// by SIMD instructions when available. This is what is naturally produced
737 /// by SIMD comparison operators on the vfloat16 and vint16 types.
738 class vbool16 {
739 public:
740     static const char* type_name() { return "vbool16"; }
741     typedef bool value_t;        ///< Underlying equivalent scalar value type
742     enum { elements = 16 };       ///< Number of scalar elements
743     enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
744     enum { bits = 16 };           ///< Total number of bits
745     typedef simd_bool_t<16>::type simd_t;  ///< the native SIMD type used
746 
747     /// Default constructor (contents undefined)
748     vbool16 () { }
749 
750     /// Construct from a single value (store it in all slots)
751     vbool16 (bool a) { load (a); }
752 
753     explicit vbool16 (int bitmask) { load_bitmask (bitmask); }
754 
755     explicit vbool16 (const bool *values);
756 
757     /// Construct from 16 bool values
758     vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
759             bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
760 
761     /// Copy construct from another vbool16
762     vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
763 
764     /// Construct from 16 int values
765     vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
766              int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
767 
768     /// Construct from a SIMD int (is each element nonzero?)
769     vbool16 (const vint16 &i);
770 
771     /// Construct from two vbool8's
772     vbool16 (const vbool8 &lo, const vbool8 &hi);
773 
774     /// Construct from four vbool4's
775     vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);
776 
777     /// Construct from the underlying SIMD type
778     vbool16 (const simd_t& m) : m_simd(m) { }
779 
780     /// Return the raw SIMD type
781     operator simd_t () const { return m_simd; }
782     simd_t simd () const { return m_simd; }
783     simd_t& simd () { return m_simd; }
784 
785     int bitmask () const;
786 
787     /// Convert from integer bitmask to a true vbool16
788     static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }
789 
790     /// Set all components to false
791     void clear ();
792 
793     /// Return a vbool16 the is 'false' for all values
794     static const vbool16 False ();
795 
796     /// Return a vbool16 the is 'true' for all values
797     static const vbool16 True ();
798 
799     /// Assign one value to all components
800     const vbool16 & operator= (bool a);
801 
802     /// Assignment of another vbool16
803     const vbool16 & operator= (const vbool16 & other);
804 
805     /// Component access (get)
806     int operator[] (int i) const;
807 
808     /// Component access (set).
809     void setcomp (int i, bool value);
810 
811     /// Extract the lower precision vbool8
812     vbool8 lo () const;
813 
814     /// Extract the higher precision vbool8
815     vbool8 hi () const;
816 
817     /// Helper: load a single value into all components.
818     void load (bool a);
819 
820     /// Helper: load separate values into each component.
821     void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
822                bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
823 
824     /// Helper: load all components from a bitmask in an int.
825     void load_bitmask (int a);
826 
827     /// Helper: store the values into memory as bools.
828     void store (bool *values) const;
829 
830     /// Store the first n values into memory.
831     void store (bool *values, int n) const;
832 
833     /// Logical/bitwise operators, component-by-component
834     friend vbool4 operator! (const vbool4& a);
835     friend vbool16 operator! (const vbool16& a);
836     friend vbool16 operator& (const vbool16& a, const vbool16& b);
837     friend vbool16 operator| (const vbool16& a, const vbool16& b);
838     friend vbool16 operator^ (const vbool16& a, const vbool16& b);
839     friend vbool16 operator~ (const vbool16& a);
840     friend const vbool16& operator&= (vbool16& a, const vbool16& b);
841     friend const vbool16& operator|= (vbool16& a, const vbool16& b);
842     friend const vbool16& operator^= (vbool16& a, const vbool16& b);
843 
844     /// Comparison operators, component by component
845     friend vbool16 operator== (const vbool16& a, const vbool16& b);
846     friend vbool16 operator!= (const vbool16& a, const vbool16& b);
847 
848     /// Stream output
849     friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);
850 
851 private:
852     // The actual data representation
853     union {
854         simd_t   m_simd;
855         uint16_t m_bits;
856     };
857 };
858 
859 
860 
861 /// Helper: as rapid as possible extraction of one component, when the
862 /// index is fixed.
863 template<int i> bool extract (const vbool16& a);
864 
865 /// Helper: substitute val for a[i]
866 template<int i> vbool16 insert (const vbool16& a, bool val);
867 
868 /// Logical reduction across all components.
869 bool reduce_and (const vbool16& v);
870 bool reduce_or (const vbool16& v);
871 
872 // Are all/any/no components true?
873 bool all (const vbool16& v);
874 bool any (const vbool16& v);
875 bool none (const vbool16& v);
876 
877 
878 
879 
880 
881 /// Integer 4-vector, accelerated by SIMD instructions when available.
882 class vint4 {
883 public:
884     static const char* type_name() { return "vint4"; }
885     typedef int value_t;      ///< Underlying equivalent scalar value type
886     enum { elements = 4 };    ///< Number of scalar elements
887     enum { paddedelements =4 }; ///< Number of scalar elements for full pad
888     enum { bits = 128 };      ///< Total number of bits
889     typedef simd_raw_t<int,elements>::type simd_t;  ///< the native SIMD type used
890     typedef vbool4 vbool_t;   ///< bool type of the same length
891     typedef vfloat4 vfloat_t; ///< float type of the same length
892     typedef vint4 vint_t;     ///< int type of the same length
893     typedef vbool4 bool_t;   // old name (deprecated 1.8)
894     typedef vfloat4 float_t; // old name (deprecated 1.8)
895 
896     /// Default constructor (contents undefined)
897     vint4 () { }
898 
899     /// Construct from a single value (store it in all slots)
900     vint4 (int a);
901 
902     /// Construct from 2 values -- (a,a,b,b)
903     vint4 (int a, int b);
904 
905     /// Construct from 4 values
906     vint4 (int a, int b, int c, int d);
907 
908     /// Construct from a pointer to values
909     vint4 (const int *vals);
910 
911     /// Construct from a pointer to unsigned short values
912     explicit vint4 (const unsigned short *vals);
913 
914     /// Construct from a pointer to signed short values
915     explicit vint4 (const short *vals);
916 
917     /// Construct from a pointer to unsigned char values (0 - 255)
918     explicit vint4 (const unsigned char *vals);
919 
920     /// Construct from a pointer to signed char values (-128 - 127)
921     explicit vint4 (const char *vals);
922 
923     /// Copy construct from another vint4
924     vint4 (const vint4 & other) { m_simd = other.m_simd; }
925 
926     /// Convert a vfloat to an vint. Equivalent to i = (int)f;
927     explicit vint4 (const vfloat4& f); // implementation below
928 
929     /// Construct from the underlying SIMD type
930     vint4 (const simd_t& m) : m_simd(m) { }
931 
932     /// Return the raw SIMD type
933     operator simd_t () const { return m_simd; }
934     simd_t simd () const { return m_simd; }
935     simd_t& simd () { return m_simd; }
936 
937     /// Return a pointer to the underlying scalar type
938     const value_t* data () const { return (const value_t*)this; }
939     value_t* data () { return (value_t*)this; }
940 
941     /// Sset all components to 0
942     void clear () ;
943 
944     /// Return an vint4 with all components set to 0
945     static const vint4 Zero ();
946 
947     /// Return an vint4 with all components set to 1
948     static const vint4 One ();
949 
950     /// Return an vint4 with all components set to -1 (aka 0xffffffff)
951     static const vint4 NegOne ();
952 
953     /// Return an vint4 with incremented components (e.g., 0,1,2,3).
954     /// Optional arguments can give a non-zero starting point and step size.
955     static const vint4 Iota (int start=0, int step=1);
956 
957     /// Return an vint4 with "geometric" iota: (1, 2, 4, 8).
958     static const vint4 Giota ();
959 
960     /// Assign one value to all components.
961     const vint4 & operator= (int a);
962 
963     /// Assignment from another vint4
964     const vint4 & operator= (const vint4& other) ;
965 
966     /// Component access (get)
967     int operator[] (int i) const;
968 
969     /// Component access (set)
970     int& operator[] (int i);
971 
972     /// Component access (set).
973     void setcomp (int i, int value);
974 
975     value_t x () const;
976     value_t y () const;
977     value_t z () const;
978     value_t w () const;
979     void set_x (value_t val);
980     void set_y (value_t val);
981     void set_z (value_t val);
982     void set_w (value_t val);
983 
984     /// Helper: load a single int into all components
985     void load (int a);
986 
987     /// Helper: load separate values into each component.
988     void load (int a, int b, int c, int d);
989 
990     /// Load from an array of 4 values
991     void load (const int *values);
992 
993     void load (const int *values, int n) ;
994 
995     /// Load from an array of 4 unsigned short values, convert to vint4
996     void load (const unsigned short *values) ;
997 
998     /// Load from an array of 4 unsigned short values, convert to vint4
999     void load (const short *values);
1000 
1001     /// Load from an array of 4 unsigned char values, convert to vint4
1002     void load (const unsigned char *values);
1003 
1004     /// Load from an array of 4 unsigned char values, convert to vint4
1005     void load (const char *values);
1006 
1007     /// Store the values into memory
1008     void store (int *values) const;
1009 
1010     /// Store the first n values into memory
1011     void store (int *values, int n) const;
1012 
1013     /// Store the least significant 16 bits of each element into adjacent
1014     /// unsigned shorts.
1015     void store (unsigned short *values) const;
1016 
1017     /// Store the least significant 8 bits of each element into adjacent
1018     /// unsigned chars.
1019     void store (unsigned char *values) const;
1020 
1021     /// Masked load -- read from values[] where mask is 1, load zero where
1022     /// mask is 0.
1023     void load_mask (int mask, const value_t *values);
1024     void load_mask (const vbool_t& mask, const value_t *values);
1025 
1026     /// Masked store -- write to values[] where mask is enabled, don't
1027     /// touch values[] where it's not.
1028     void store_mask (int mask, value_t *values) const;
1029     void store_mask (const vbool_t& mask, value_t *values) const;
1030 
1031     /// Load values from addresses  (char*)basepatr + vindex[i]*scale
1032     template<int scale=4>
1033     void gather (const value_t *baseptr, const vint_t& vindex);
1034     /// Gather elements defined by the mask, leave others unchanged.
1035     template<int scale=4>
1036     void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1037     template<int scale=4>
1038     void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1039 
1040     /// Store values at addresses  (char*)basepatr + vindex[i]*scale
1041     template<int scale=4>
1042     void scatter (value_t *baseptr, const vint_t& vindex) const;
1043     /// Scatter elements defined by the mask
1044     template<int scale=4>
1045     void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1046     template<int scale=4>
1047     void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1048 
1049     // Arithmetic operators (component-by-component)
1050     friend vint4 operator+ (const vint4& a, const vint4& b);
1051     friend vint4 operator- (const vint4& a);
1052     friend vint4 operator- (const vint4& a, const vint4& b);
1053     friend vint4 operator* (const vint4& a, const vint4& b);
1054     friend vint4 operator/ (const vint4& a, const vint4& b);
1055     friend vint4 operator% (const vint4& a, const vint4& b);
1056     friend const vint4 & operator+= (vint4& a, const vint4& b);
1057     friend const vint4 & operator-= (vint4& a, const vint4& b);
1058     friend const vint4 & operator*= (vint4& a, const vint4& b);
1059     friend const vint4 & operator/= (vint4& a, const vint4& b);
1060     friend const vint4 & operator%= (vint4& a, const vint4& b);
1061     // Bitwise operators (component-by-component)
1062     friend vint4 operator& (const vint4& a, const vint4& b);
1063     friend vint4 operator| (const vint4& a, const vint4& b);
1064     friend vint4 operator^ (const vint4& a, const vint4& b);
1065     friend const vint4& operator&= (vint4& a, const vint4& b);
1066     friend const vint4& operator|= (vint4& a, const vint4& b);
1067     friend const vint4& operator^= (vint4& a, const vint4& b);
1068     friend vint4 operator~ (const vint4& a);
1069     friend vint4 operator<< (const vint4& a, unsigned int bits);
1070     friend vint4 operator>> (const vint4& a, unsigned int bits);
1071     friend const vint4& operator<<= (vint4& a, unsigned int bits);
1072     friend const vint4& operator>>= (vint4& a, unsigned int bits);
1073     // Comparison operators (component-by-component)
1074     friend vbool4 operator== (const vint4& a, const vint4& b);
1075     friend vbool4 operator!= (const vint4& a, const vint4& b);
1076     friend vbool4 operator<  (const vint4& a, const vint4& b);
1077     friend vbool4 operator>  (const vint4& a, const vint4& b);
1078     friend vbool4 operator>= (const vint4& a, const vint4& b);
1079     friend vbool4 operator<= (const vint4& a, const vint4& b);
1080 
1081     /// Stream output
1082     friend std::ostream& operator<< (std::ostream& cout, const vint4 & a);
1083 
1084 private:
1085     // The actual data representation
1086     union {
1087         simd_t  m_simd;
1088         value_t m_val[elements];
1089     };
1090 };
1091 
1092 
1093 
1094 // Shift right logical -- unsigned shift. This differs from operator>>
1095 // in how it handles the sign bit.  (1<<31) >> 1 == (1<<31), but
1096 // srl((1<<31),1) == 1<<30.
1097 vint4 srl (const vint4& val, const unsigned int bits);
1098 
1099 /// Helper: shuffle/swizzle with constant (templated) indices.
1100 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
1101 template<int i0, int i1, int i2, int i3> vint4 shuffle (const vint4& a);
1102 
1103 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1104 template<int i> vint4 shuffle (const vint4& a);
1105 
1106 /// Helper: as rapid as possible extraction of one component, when the
1107 /// index is fixed.
1108 template<int i> int extract (const vint4& v);
1109 
1110 /// The sum of all components, returned in all components.
1111 vint4 vreduce_add (const vint4& v);
1112 
1113 // Reduction across all components
1114 int reduce_add (const vint4& v);
1115 int reduce_and (const vint4& v);
1116 int reduce_or (const vint4& v);
1117 
1118 /// Use a bool mask to select between components of a (if mask[i] is false)
1119 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1120 vint4 blend (const vint4& a, const vint4& b, const vbool4& mask);
1121 
1122 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1123 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1124 /// blend(0,a,mask).
1125 vint4 blend0 (const vint4& a, const vbool4& mask);
1126 
1127 /// Use a bool mask to select between components of a (if mask[i] is false)
1128 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1129 /// blend(0,a,!mask), or blend(a,0,mask).
1130 vint4 blend0not (const vint4& a, const vbool4& mask);
1131 
1132 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1133 /// synonym for blend with arguments rearranged, but this is more clear
1134 /// because the arguments are symmetric to scalar (cond ? a : b).
1135 vint4 select (const vbool4& mask, const vint4& a, const vint4& b);
1136 
1137 // Per-element math
1138 vint4 abs (const vint4& a);
1139 vint4 min (const vint4& a, const vint4& b);
1140 vint4 max (const vint4& a, const vint4& b);
1141 
1142 /// Circular bit rotate by s bits, for N values at once.
1143 vint4 rotl (const vint4& x, const int s);
1144 // DEPRECATED(2.1)
1145 vint4 rotl32 (const vint4& x, const unsigned int k);
1146 
1147 /// andnot(a,b) returns ((~a) & b)
1148 vint4 andnot (const vint4& a, const vint4& b);
1149 
1150 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1151 vint4 bitcast_to_int (const vbool4& x);
1152 vint4 bitcast_to_int (const vfloat4& x);
1153 vfloat4 bitcast_to_float (const vint4& x);
1154 
1155 void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d);
1156 void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
1157                 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1158 
1159 vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d);
1160 
1161 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1162 vint4 safe_mod (const vint4& a, const vint4& b);
1163 vint4 safe_mod (const vint4& a, int b);
1164 
1165 
1166 
1167 
1168 /// Integer 8-vector, accelerated by SIMD instructions when available.
1169 class vint8 {
1170 public:
1171     static const char* type_name() { return "vint8"; }
1172     typedef int value_t;      ///< Underlying equivalent scalar value type
1173     enum { elements = 8 };    ///< Number of scalar elements
1174     enum { paddedelements =8 }; ///< Number of scalar elements for full pad
1175     enum { bits = elements*32 }; ///< Total number of bits
1176     typedef simd_raw_t<int,elements>::type simd_t;  ///< the native SIMD type used
1177     typedef vbool8 vbool_t;   ///< bool type of the same length
1178     typedef vfloat8 vfloat_t; ///< float type of the same length
1179     typedef vint8 vint_t;     ///< int type of the same length
1180     typedef vbool8 bool_t;   // old name (deprecated 1.8)
1181     typedef vfloat8 float_t; // old name (deprecated 1.8)
1182 
1183     /// Default constructor (contents undefined)
1184     vint8 () { }
1185 
1186     /// Construct from a single value (store it in all slots)
1187     vint8 (int a);
1188 
1189     /// Construct from 2 values -- (a,a,b,b)
1190     vint8 (int a, int b);
1191 
1192     /// Construct from 8 values (won't work for vint8)
1193     vint8 (int a, int b, int c, int d, int e, int f, int g, int h);
1194 
1195     /// Construct from a pointer to values
1196     vint8 (const int *vals);
1197 
1198     /// Construct from a pointer to unsigned short values
1199     explicit vint8 (const unsigned short *vals);
1200 
1201     /// Construct from a pointer to signed short values
1202     explicit vint8 (const short *vals);
1203 
1204     /// Construct from a pointer to unsigned char values (0 - 255)
1205     explicit vint8 (const unsigned char *vals);
1206 
1207     /// Construct from a pointer to signed char values (-128 - 127)
1208     explicit vint8 (const char *vals);
1209 
1210     /// Copy construct from another vint8
1211     vint8 (const vint8 & other) { m_simd = other.m_simd; }
1212 
1213     /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
1214     explicit vint8 (const vfloat8& f); // implementation below
1215 
1216     /// Construct from two vint4's
1217     vint8 (const vint4 &lo, const vint4 &hi);
1218 
1219     /// Construct from the underlying SIMD type
1220     vint8 (const simd_t& m) : m_simd(m) { }
1221 
1222     /// Return the raw SIMD type
1223     operator simd_t () const { return m_simd; }
1224     simd_t simd () const { return m_simd; }
1225     simd_t& simd () { return m_simd; }
1226 
1227     /// Return a pointer to the underlying scalar type
1228     const value_t* data () const { return (const value_t*)this; }
1229     value_t* data () { return (value_t*)this; }
1230 
1231     /// Sset all components to 0
1232     void clear () ;
1233 
1234     /// Return an vint8 with all components set to 0
1235     static const vint8 Zero ();
1236 
1237     /// Return an vint8 with all components set to 1
1238     static const vint8 One ();
1239 
1240     /// Return an vint8 with all components set to -1 (aka 0xffffffff)
1241     static const vint8 NegOne ();
1242 
1243     /// Return an vint8 with incremented components (e.g., 0,1,2,3).
1244     /// Optional arguments can give a non-zero starting point and step size.
1245     static const vint8 Iota (int start=0, int step=1);
1246 
1247     /// Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
1248     static const vint8 Giota ();
1249 
1250     /// Assign one value to all components.
1251     const vint8 & operator= (int a);
1252 
1253     /// Assignment from another vint8
1254     const vint8 & operator= (const vint8& other) ;
1255 
1256     /// Component access (get)
1257     int operator[] (int i) const;
1258 
1259     /// Component access (set)
1260     int& operator[] (int i);
1261 
1262     /// Component access (set).
1263     void setcomp (int i, int value);
1264 
1265     value_t x () const;
1266     value_t y () const;
1267     value_t z () const;
1268     value_t w () const;
1269     void set_x (value_t val);
1270     void set_y (value_t val);
1271     void set_z (value_t val);
1272     void set_w (value_t val);
1273 
1274     /// Extract the lower precision vint4
1275     vint4 lo () const;
1276 
1277     /// Extract the higher precision vint4
1278     vint4 hi () const;
1279 
1280     /// Helper: load a single int into all components
1281     void load (int a);
1282 
1283     /// Load separate values into each component.
1284     void load (int a, int b, int c, int d, int e, int f, int g, int h);
1285 
1286     /// Load from an array of 8 values
1287     void load (const int *values);
1288 
1289     void load (const int *values, int n) ;
1290 
1291     /// Load from an array of 8 unsigned short values, convert to vint8
1292     void load (const unsigned short *values) ;
1293 
1294     /// Load from an array of 8 unsigned short values, convert to vint8
1295     void load (const short *values);
1296 
1297     /// Load from an array of 8 unsigned char values, convert to vint8
1298     void load (const unsigned char *values);
1299 
1300     /// Load from an array of 8 unsigned char values, convert to vint8
1301     void load (const char *values);
1302 
1303     /// Store the values into memory
1304     void store (int *values) const;
1305 
1306     /// Store the first n values into memory
1307     void store (int *values, int n) const;
1308 
1309     /// Store the least significant 16 bits of each element into adjacent
1310     /// unsigned shorts.
1311     void store (unsigned short *values) const;
1312 
1313     /// Store the least significant 8 bits of each element into adjacent
1314     /// unsigned chars.
1315     void store (unsigned char *values) const;
1316 
1317     /// Masked load -- read from values[] where mask is 1, load zero where
1318     /// mask is 0.
1319     void load_mask (int mask, const value_t *values);
1320     void load_mask (const vbool_t& mask, const value_t *values);
1321 
1322     /// Masked store -- write to values[] where mask is enabled, don't
1323     /// touch values[] where it's not.
1324     void store_mask (int mask, value_t *values) const;
1325     void store_mask (const vbool_t& mask, value_t *values) const;
1326 
1327     /// Load values from addresses  (char*)basepatr + vindex[i]*scale
1328     template<int scale=4>
1329     void gather (const value_t *baseptr, const vint_t& vindex);
1330     /// Gather elements defined by the mask, leave others unchanged.
1331     template<int scale=4>
1332     void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1333     template<int scale=4>
1334     void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1335 
1336     /// Store values at addresses  (char*)basepatr + vindex[i]*scale
1337     template<int scale=4>
1338     void scatter (value_t *baseptr, const vint_t& vindex) const;
1339     /// Scatter elements defined by the mask
1340     template<int scale=4>
1341     void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1342     template<int scale=4>
1343     void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1344 
1345     // Arithmetic operators (component-by-component)
1346     friend vint8 operator+ (const vint8& a, const vint8& b);
1347     friend vint8 operator- (const vint8& a);
1348     friend vint8 operator- (const vint8& a, const vint8& b);
1349     friend vint8 operator* (const vint8& a, const vint8& b);
1350     friend vint8 operator/ (const vint8& a, const vint8& b);
1351     friend vint8 operator% (const vint8& a, const vint8& b);
1352     friend const vint8 & operator+= (vint8& a, const vint8& b);
1353     friend const vint8 & operator-= (vint8& a, const vint8& b);
1354     friend const vint8 & operator*= (vint8& a, const vint8& b);
1355     friend const vint8 & operator/= (vint8& a, const vint8& b);
1356     friend const vint8 & operator%= (vint8& a, const vint8& b);
1357     // Bitwise operators (component-by-component)
1358     friend vint8 operator& (const vint8& a, const vint8& b);
1359     friend vint8 operator| (const vint8& a, const vint8& b);
1360     friend vint8 operator^ (const vint8& a, const vint8& b);
1361     friend const vint8& operator&= (vint8& a, const vint8& b);
1362     friend const vint8& operator|= (vint8& a, const vint8& b);
1363     friend const vint8& operator^= (vint8& a, const vint8& b);
1364     friend vint8 operator~ (const vint8& a);
1365     friend vint8 operator<< (const vint8& a, unsigned int bits);
1366     friend vint8 operator>> (const vint8& a, unsigned int bits);
1367     friend const vint8& operator<<= (vint8& a, unsigned int bits);
1368     friend const vint8& operator>>= (vint8& a, unsigned int bits);
1369     // Comparison operators (component-by-component)
1370     friend vbool8 operator== (const vint8& a, const vint8& b);
1371     friend vbool8 operator!= (const vint8& a, const vint8& b);
1372     friend vbool8 operator<  (const vint8& a, const vint8& b);
1373     friend vbool8 operator>  (const vint8& a, const vint8& b);
1374     friend vbool8 operator>= (const vint8& a, const vint8& b);
1375     friend vbool8 operator<= (const vint8& a, const vint8& b);
1376 
1377     /// Stream output
1378     friend std::ostream& operator<< (std::ostream& cout, const vint8& a);
1379 
1380 private:
1381     // The actual data representation
1382     union {
1383         simd_t  m_simd;
1384         value_t m_val[elements];
1385         vint4 m_4[2];
1386     };
1387 };
1388 
1389 
1390 
1391 // Shift right logical -- unsigned shift. This differs from operator>>
1392 // in how it handles the sign bit.  (1<<31) >> 1 == (1<<31), but
1393 // srl((1<<31),1) == 1<<30.
1394 vint8 srl (const vint8& val, const unsigned int bits);
1395 
1396 /// Helper: shuffle/swizzle with constant (templated) indices.
1397 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
1398 template<int i0, int i1, int i2, int i3,
1399          int i4, int i5, int i6, int i7> vint8 shuffle (const vint8& a);
1400 
1401 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1402 template<int i> vint8 shuffle (const vint8& a);
1403 
1404 /// Helper: as rapid as possible extraction of one component, when the
1405 /// index is fixed.
1406 template<int i> int extract (const vint8& v);
1407 
1408 /// Helper: substitute val for a[i]
1409 template<int i> vint8 insert (const vint8& a, int val);
1410 
1411 /// The sum of all components, returned in all components.
1412 vint8 vreduce_add (const vint8& v);
1413 
1414 // Reduction across all components
1415 int reduce_add (const vint8& v);
1416 int reduce_and (const vint8& v);
1417 int reduce_or (const vint8& v);
1418 
1419 /// Use a bool mask to select between components of a (if mask[i] is false)
1420 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1421 vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);
1422 
1423 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1424 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1425 /// blend(0,a,mask).
1426 vint8 blend0 (const vint8& a, const vbool8& mask);
1427 
1428 /// Use a bool mask to select between components of a (if mask[i] is false)
1429 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1430 /// blend(0,a,!mask), or blend(a,0,mask).
1431 vint8 blend0not (const vint8& a, const vbool8& mask);
1432 
1433 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1434 /// synonym for blend with arguments rearranged, but this is more clear
1435 /// because the arguments are symmetric to scalar (cond ? a : b).
1436 vint8 select (const vbool8& mask, const vint8& a, const vint8& b);
1437 
1438 // Per-element math
1439 vint8 abs (const vint8& a);
1440 vint8 min (const vint8& a, const vint8& b);
1441 vint8 max (const vint8& a, const vint8& b);
1442 
1443 /// Circular bit rotate by s bits, for N values at once.
1444 vint8 rotl (const vint8& x, const int s);
1445 // DEPRECATED(2.1)
1446 vint8 rotl32 (const vint8& x, const unsigned int k);
1447 
1448 /// andnot(a,b) returns ((~a) & b)
1449 vint8 andnot (const vint8& a, const vint8& b);
1450 
1451 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1452 vint8 bitcast_to_int (const vbool8& x);
1453 vint8 bitcast_to_int (const vfloat8& x);
1454 vfloat8 bitcast_to_float (const vint8& x);
1455 
1456 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1457 vint8 safe_mod (const vint8& a, const vint8& b);
1458 vint8 safe_mod (const vint8& a, int b);
1459 
1460 
1461 
1462 
1463 
1464 /// Integer 16-vector, accelerated by SIMD instructions when available.
1465 class vint16 {
1466 public:
1467     static const char* type_name() { return "vint16"; }
1468     typedef int value_t;      ///< Underlying equivalent scalar value type
1469     enum { elements = 16 };    ///< Number of scalar elements
1470     enum { paddedelements =16 }; ///< Number of scalar elements for full pad
1471     enum { bits = 128 };      ///< Total number of bits
1472     typedef simd_raw_t<int,elements>::type simd_t;  ///< the native SIMD type used
1473     typedef vbool16 vbool_t;   ///< bool type of the same length
1474     typedef vfloat16 vfloat_t; ///< float type of the same length
1475     typedef vint16 vint_t;     ///< int type of the same length
1476     typedef vbool16 bool_t;   // old name (deprecated 1.8)
1477     typedef vfloat16 float_t; // old name (deprecated 1.8)
1478 
1479     /// Default constructor (contents undefined)
1480     vint16 () { }
1481 
1482     /// Construct from a single value (store it in all slots)
1483     vint16 (int a);
1484 
1485     /// Construct from 16 values (won't work for vint16)
1486     vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1487            int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1488 
1489     /// Construct from a pointer to values
1490     vint16 (const int *vals);
1491 
1492     /// Construct from a pointer to unsigned short values
1493     explicit vint16 (const unsigned short *vals);
1494 
1495     /// Construct from a pointer to signed short values
1496     explicit vint16 (const short *vals);
1497 
1498     /// Construct from a pointer to unsigned char values (0 - 255)
1499     explicit vint16 (const unsigned char *vals);
1500 
1501     /// Construct from a pointer to signed char values (-128 - 127)
1502     explicit vint16 (const char *vals);
1503 
1504     /// Copy construct from another vint16
1505     vint16 (const vint16 & other) { m_simd = other.m_simd; }
1506 
1507     /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
1508     explicit vint16 (const vfloat16& f); // implementation below
1509 
1510     /// Construct from two vint8's
1511     vint16 (const vint8 &lo, const vint8 &hi);
1512 
1513     /// Construct from four vint4's
1514     vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d);
1515 
1516     /// Construct from the underlying SIMD type
1517     vint16 (const simd_t& m) : m_simd(m) { }
1518 
1519     /// Return the raw SIMD type
1520     operator simd_t () const { return m_simd; }
1521     simd_t simd () const { return m_simd; }
1522     simd_t& simd () { return m_simd; }
1523 
1524     /// Return a pointer to the underlying scalar type
1525     const value_t* data () const { return (const value_t*)this; }
1526     value_t* data () { return (value_t*)this; }
1527 
1528     /// Sset all components to 0
1529     void clear () ;
1530 
1531     /// Return an vint16 with all components set to 0
1532     static const vint16 Zero ();
1533 
1534     /// Return an vint16 with all components set to 1
1535     static const vint16 One ();
1536 
1537     /// Return an vint16 with all components set to -1 (aka 0xffffffff)
1538     static const vint16 NegOne ();
1539 
1540     /// Return an vint16 with incremented components (e.g., 0,1,2,3).
1541     /// Optional arguments can give a non-zero starting point and step size.
1542     static const vint16 Iota (int start=0, int step=1);
1543 
1544     /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
1545     static const vint16 Giota ();
1546 
1547     /// Assign one value to all components.
1548     const vint16 & operator= (int a);
1549 
1550     /// Assignment from another vint16
1551     const vint16 & operator= (const vint16& other) ;
1552 
1553     /// Component access (get)
1554     int operator[] (int i) const;
1555 
1556     /// Component access (set)
1557     int& operator[] (int i);
1558 
1559     /// Component access (set).
1560     void setcomp (int i, int value);
1561 
1562     value_t x () const;
1563     value_t y () const;
1564     value_t z () const;
1565     value_t w () const;
1566     void set_x (value_t val);
1567     void set_y (value_t val);
1568     void set_z (value_t val);
1569     void set_w (value_t val);
1570 
1571     /// Extract the lower precision vint8
1572     vint8 lo () const;
1573 
1574     /// Extract the higher precision vint8
1575     vint8 hi () const;
1576 
1577     /// Helper: load a single int into all components
1578     void load (int a);
1579 
1580     /// Load separate values into each component.
1581     void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1582                int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1583 
1584     /// Load from an array of 16 values
1585     void load (const int *values);
1586 
1587     void load (const int *values, int n) ;
1588 
1589     /// Load from an array of 16 unsigned short values, convert to vint16
1590     void load (const unsigned short *values) ;
1591 
1592     /// Load from an array of 16 unsigned short values, convert to vint16
1593     void load (const short *values);
1594 
1595     /// Load from an array of 16 unsigned char values, convert to vint16
1596     void load (const unsigned char *values);
1597 
1598     /// Load from an array of 16 unsigned char values, convert to vint16
1599     void load (const char *values);
1600 
1601     /// Store the values into memory
1602     void store (int *values) const;
1603 
1604     /// Store the first n values into memory
1605     void store (int *values, int n) const;
1606 
1607     /// Store the least significant 16 bits of each element into adjacent
1608     /// unsigned shorts.
1609     void store (unsigned short *values) const;
1610 
1611     /// Store the least significant 8 bits of each element into adjacent
1612     /// unsigned chars.
1613     void store (unsigned char *values) const;
1614 
1615     /// Masked load -- read from values[] where mask is 1, load zero where
1616     /// mask is 0.
1617     void load_mask (const vbool_t &mask, const value_t *values);
1618     void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
1619 
1620     /// Masked store -- write to values[] where mask is enabled, don't
1621     /// touch values[] where it's not.
1622     void store_mask (const vbool_t &mask, value_t *values) const;
1623     void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
1624 
1625     /// Load values from addresses  (char*)basepatr + vindex[i]*scale
1626     template<int scale=4>
1627     void gather (const value_t *baseptr, const vint_t& vindex);
1628     /// Gather elements defined by the mask, leave others unchanged.
1629     template<int scale=4>
1630     void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1631     template<int scale=4>
1632     void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
1633         gather_mask<scale> (vbool_t(mask), baseptr, vindex);
1634     }
1635 
1636     /// Store values at addresses  (char*)basepatr + vindex[i]*scale
1637     template<int scale=4>
1638     void scatter (value_t *baseptr, const vint_t& vindex) const;
1639     /// Scatter elements defined by the mask
1640     template<int scale=4>
1641     void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1642     template<int scale=4>
1643     void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
1644         scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
1645     }
1646 
1647     // Arithmetic operators (component-by-component)
1648     friend vint16 operator+ (const vint16& a, const vint16& b);
1649     friend vint16 operator- (const vint16& a);
1650     friend vint16 operator- (const vint16& a, const vint16& b);
1651     friend vint16 operator* (const vint16& a, const vint16& b);
1652     friend vint16 operator/ (const vint16& a, const vint16& b);
1653     friend vint16 operator% (const vint16& a, const vint16& b);
1654     friend const vint16 & operator+= (vint16& a, const vint16& b);
1655     friend const vint16 & operator-= (vint16& a, const vint16& b);
1656     friend const vint16 & operator*= (vint16& a, const vint16& b);
1657     friend const vint16 & operator/= (vint16& a, const vint16& b);
1658     friend const vint16 & operator%= (vint16& a, const vint16& b);
1659     // Bitwise operators (component-by-component)
1660     friend vint16 operator& (const vint16& a, const vint16& b);
1661     friend vint16 operator| (const vint16& a, const vint16& b);
1662     friend vint16 operator^ (const vint16& a, const vint16& b);
1663     friend const vint16& operator&= (vint16& a, const vint16& b);
1664     friend const vint16& operator|= (vint16& a, const vint16& b);
1665     friend const vint16& operator^= (vint16& a, const vint16& b);
1666     friend vint16 operator~ (const vint16& a);
1667     friend vint16 operator<< (const vint16& a, unsigned int bits);
1668     friend vint16 operator>> (const vint16& a, unsigned int bits);
1669     friend const vint16& operator<<= (vint16& a, unsigned int bits);
1670     friend const vint16& operator>>= (vint16& a, unsigned int bits);
1671     // Comparison operators (component-by-component)
1672     friend vbool16 operator== (const vint16& a, const vint16& b);
1673     friend vbool16 operator!= (const vint16& a, const vint16& b);
1674     friend vbool16 operator<  (const vint16& a, const vint16& b);
1675     friend vbool16 operator>  (const vint16& a, const vint16& b);
1676     friend vbool16 operator>= (const vint16& a, const vint16& b);
1677     friend vbool16 operator<= (const vint16& a, const vint16& b);
1678 
1679     /// Stream output
1680     friend std::ostream& operator<< (std::ostream& cout, const vint16& a);
1681 
1682 private:
1683     // The actual data representation
1684     union {
1685         simd_t  m_simd;
1686         value_t m_val[elements];
1687         vint8    m_8[2];
1688     };
1689 };
1690 
1691 
1692 
1693 /// Shift right logical -- unsigned shift. This differs from operator>>
1694 /// in how it handles the sign bit.  (1<<31) >> 1 == (1<<31), but
1695 /// srl((1<<31),1) == 1<<30.
1696 vint16 srl (const vint16& val, const unsigned int bits);
1697 
1698 /// Shuffle groups of 4
1699 template<int i0, int i1, int i2, int i3>
1700 vint16 shuffle4 (const vint16& a);
1701 
1702 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
1703 template<int i> vint16 shuffle4 (const vint16& a);
1704 
1705 /// Shuffle within each group of 4
1706 template<int i0, int i1, int i2, int i3>
1707 vint16 shuffle (const vint16& a);
1708 
1709 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1710 template<int i> vint16 shuffle (const vint16& a);
1711 
1712 /// Helper: as rapid as possible extraction of one component, when the
1713 /// index is fixed.
1714 template<int i> int extract (const vint16& v);
1715 
1716 /// Helper: substitute val for a[i]
1717 template<int i> vint16 insert (const vint16& a, int val);
1718 
1719 /// The sum of all components, returned in all components.
1720 vint16 vreduce_add (const vint16& v);
1721 
1722 // Reduction across all components
1723 int reduce_add (const vint16& v);
1724 int reduce_and (const vint16& v);
1725 int reduce_or  (const vint16& v);
1726 
1727 /// Use a bool mask to select between components of a (if mask[i] is false)
1728 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1729 vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);
1730 
1731 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1732 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1733 /// blend(0,a,mask).
1734 vint16 blend0 (const vint16& a, const vbool16& mask);
1735 
1736 /// Use a bool mask to select between components of a (if mask[i] is false)
1737 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1738 /// blend(0,a,!mask), or blend(a,0,mask).
1739 vint16 blend0not (const vint16& a, const vbool16& mask);
1740 
1741 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1742 /// synonym for blend with arguments rearranged, but this is more clear
1743 /// because the arguments are symmetric to scalar (cond ? a : b).
1744 vint16 select (const vbool16& mask, const vint16& a, const vint16& b);
1745 
1746 // Per-element math
1747 vint16 abs (const vint16& a);
1748 vint16 min (const vint16& a, const vint16& b);
1749 vint16 max (const vint16& a, const vint16& b);
1750 
1751 /// Circular bit rotate by s bits, for N values at once.
1752 vint16 rotl (const vint16& x, const int s);
1753 // DEPRECATED(2.1)
1754 vint16 rotl32 (const vint16& x, const unsigned int k);
1755 
1756 /// andnot(a,b) returns ((~a) & b)
1757 vint16 andnot (const vint16& a, const vint16& b);
1758 
1759 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1760 vint16 bitcast_to_int (const vbool16& x);
1761 vint16 bitcast_to_int (const vfloat16& x);
1762 vfloat16 bitcast_to_float (const vint16& x);
1763 
1764 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1765 vint16 safe_mod (const vint16& a, const vint16& b);
1766 vint16 safe_mod (const vint16& a, int b);
1767 
1768 
1769 
1770 
1771 
1772 /// Floating point 4-vector, accelerated by SIMD instructions when
1773 /// available.
1774 class vfloat4 {
1775 public:
1776     static const char* type_name() { return "vfloat4"; }
1777     typedef float value_t;    ///< Underlying equivalent scalar value type
1778     enum { elements = 4 };    ///< Number of scalar elements
1779     enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
1780     enum { bits = elements*32 }; ///< Total number of bits
1781     typedef simd_raw_t<float,4>::type simd_t;  ///< the native SIMD type used
1782     typedef vfloat4 vfloat_t; ///< SIMD int type
1783     typedef vint4 vint_t;     ///< SIMD int type
1784     typedef vbool4 vbool_t;   ///< SIMD bool type
1785     typedef vint4 int_t;      // old name (deprecated 1.8)
1786     typedef vbool4 bool_t;    // old name (deprecated 1.8)
1787 
1788     /// Default constructor (contents undefined)
1789     vfloat4 () { }
1790 
1791     /// Construct from a single value (store it in all slots)
1792     vfloat4 (float a) { load(a); }
1793 
1794     /// Construct from 3 or 4 values
1795     vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }
1796 
1797     /// Construct from a pointer to 4 values
1798     vfloat4 (const float *f) { load (f); }
1799 
1800     /// Copy construct from another vfloat4
1801     vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
1802 
1803     /// Construct from an vint4 (promoting all components to float)
1804     explicit vfloat4 (const vint4& ival);
1805 
1806     /// Construct from the underlying SIMD type
1807     vfloat4 (const simd_t& m) : m_simd(m) { }
1808 
1809     /// Return the raw SIMD type
1810     operator simd_t () const { return m_simd; }
1811     simd_t simd () const { return m_simd; }
1812     simd_t& simd () { return m_simd; }
1813 
1814     /// Return a pointer to the underlying scalar type
1815     const value_t* data () const { return (const value_t*)this; }
1816     value_t* data () { return (value_t*)this; }
1817 
1818     /// Construct from a Imath::V3f
1819     explicit vfloat4 (const Imath::V3f &v) { load (v[0], v[1], v[2]); }
1820 
1821     /// Cast to a Imath::V3f
1822     const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
1823 
1824     /// Construct from a Imath::V4f
1825     explicit vfloat4 (const Imath::V4f &v) { load ((const float *)&v); }
1826 
1827     /// Cast to a Imath::V4f
1828     const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }
1829 
1830     /// Construct from a pointer to 4 unsigned short values
1831     explicit vfloat4 (const unsigned short *vals) { load(vals); }
1832 
1833     /// Construct from a pointer to 4 short values
1834     explicit vfloat4 (const short *vals) { load(vals); }
1835 
1836     /// Construct from a pointer to 4 unsigned char values
1837     explicit vfloat4 (const unsigned char *vals) { load(vals); }
1838 
1839     /// Construct from a pointer to 4 char values
1840     explicit vfloat4 (const char *vals) { load(vals); }
1841 
1842 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1843     /// Construct from a pointer to 4 half (16 bit float) values
1844     explicit vfloat4 (const half *vals) { load(vals); }
1845 #endif
1846 
1847     /// Assign a single value to all components
1848     const vfloat4 & operator= (float a) { load(a); return *this; }
1849 
1850     /// Assign a vfloat4
1851     const vfloat4 & operator= (vfloat4 other) {
1852         m_simd = other.m_simd;
1853         return *this;
1854     }
1855 
1856     /// Return a vfloat4 with all components set to 0.0
1857     static const vfloat4 Zero ();
1858 
1859     /// Return a vfloat4 with all components set to 1.0
1860     static const vfloat4 One ();
1861 
1862     /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
1863     /// Optional argument can give a non-zero starting point and non-1 step.
1864     static const vfloat4 Iota (float start=0.0f, float step=1.0f);
1865 
1866     /// Set all components to 0.0
1867     void clear ();
1868 
1869     /// Assign from a Imath::V4f
1870     const vfloat4 & operator= (const Imath::V4f &v);
1871 
1872     /// Assign from a Imath::V3f
1873     const vfloat4 & operator= (const Imath::V3f &v);
1874 
1875     /// Component access (get)
1876     float operator[] (int i) const;
1877     /// Component access (set)
1878     float& operator[] (int i);
1879 
1880     /// Component access (set).
1881     void setcomp (int i, float value);
1882 
1883     value_t x () const;
1884     value_t y () const;
1885     value_t z () const;
1886     value_t w () const;
1887     void set_x (value_t val);
1888     void set_y (value_t val);
1889     void set_z (value_t val);
1890     void set_w (value_t val);
1891 
1892     /// Helper: load a single value into all components
1893     void load (float val);
1894 
1895     /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
1896     void load (float a, float b, float c, float d=0.0f);
1897 
1898     /// Load from an array of 4 values
1899     void load (const float *values);
1900 
1901     /// Load from a partial array of <=4 values. Unassigned values are
1902     /// undefined.
1903     void load (const float *values, int n);
1904 
1905     /// Load from an array of 4 unsigned short values, convert to float
1906     void load (const unsigned short *values);
1907 
1908     /// Load from an array of 4 short values, convert to float
1909     void load (const short *values);
1910 
1911     /// Load from an array of 4 unsigned char values, convert to float
1912     void load (const unsigned char *values);
1913 
1914     /// Load from an array of 4 char values, convert to float
1915     void load (const char *values);
1916 
1917 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1918     /// Load from an array of 4 half values, convert to float
1919     void load (const half *values);
1920 #endif /* _HALF_H_ or _IMATH_H_ */
1921 
1922     void store (float *values) const;
1923 
1924     /// Store the first n values into memory
1925     void store (float *values, int n) const;
1926 
1927 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1928     void store (half *values) const;
1929 #endif
1930 
1931     /// Masked load -- read from values[] where mask is 1, load zero where
1932     /// mask is 0.
1933     void load_mask (int mask, const value_t *values);
1934     void load_mask (const vbool_t& mask, const value_t *values);
1935 
1936     /// Masked store -- write to values[] where mask is enabled, don't
1937     /// touch values[] where it's not.
1938     void store_mask (int mask, value_t *values) const;
1939     void store_mask (const vbool_t& mask, value_t *values) const;
1940 
1941     /// Load values from addresses  (char*)basepatr + vindex[i]*scale
1942     template<int scale=4>
1943     void gather (const value_t *baseptr, const vint_t& vindex);
1944     /// Gather elements defined by the mask, leave others unchanged.
1945     template<int scale=4>
1946     void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1947     template<int scale=4>
1948     void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1949 
1950     /// Store values at addresses  (char*)basepatr + vindex[i]*scale
1951     template<int scale=4>
1952     void scatter (value_t *baseptr, const vint_t& vindex) const;
1953     /// Scatter elements defined by the mask
1954     template<int scale=4>
1955     void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1956     template<int scale=4>
1957     void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1958 
1959     // Arithmetic operators
1960     friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
1961     const vfloat4 & operator+= (const vfloat4& a);
1962     vfloat4 operator- () const;
1963     friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
1964     const vfloat4 & operator-= (const vfloat4& a);
1965     friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
1966     friend vfloat4 operator* (const vfloat4& a, float b);
1967     friend vfloat4 operator* (float a, const vfloat4& b);
1968     const vfloat4 & operator*= (const vfloat4& a);
1969     const vfloat4 & operator*= (float val);
1970     friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
1971     const vfloat4 & operator/= (const vfloat4& a);
1972     const vfloat4 & operator/= (float val);
1973 
1974     // Comparison operations
1975     friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
1976     friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
1977     friend vbool4 operator<  (const vfloat4& a, const vfloat4& b);
1978     friend vbool4 operator>  (const vfloat4& a, const vfloat4& b);
1979     friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
1980     friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);
1981 
1982     // Some oddball items that are handy
1983 
1984     /// Combine the first two components of A with the first two components
1985     /// of B.
1986     friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);
1987 
1988     /// Combine the first two components of A with the first two components
1989     /// of B, but interleaved.
1990     friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);
1991 
1992     /// Return xyz components, plus 0 for w
1993     vfloat4 xyz0 () const;
1994 
1995     /// Return xyz components, plus 1 for w
1996     vfloat4 xyz1 () const;
1997 
1998     /// Stream output
1999     friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);
2000 
2001 protected:
2002     // The actual data representation
2003     union {
2004         simd_t  m_simd;
2005         value_t m_val[paddedelements];
2006     };
2007 };
2008 
2009 
2010 /// Helper: shuffle/swizzle with constant (templated) indices.
2011 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
2012 template<int i0, int i1, int i2, int i3> vfloat4 shuffle (const vfloat4& a);
2013 
2014 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2015 template<int i> vfloat4 shuffle (const vfloat4& a);
2016 
2017 /// Helper: as rapid as possible extraction of one component, when the
2018 /// index is fixed.
2019 template<int i> float extract (const vfloat4& a);
2020 
2021 /// Helper: substitute val for a[i]
2022 template<int i> vfloat4 insert (const vfloat4& a, float val);
2023 
2024 /// The sum of all components, returned in all components.
2025 vfloat4 vreduce_add (const vfloat4& v);
2026 
2027 /// The sum of all components, returned as a scalar.
2028 float reduce_add (const vfloat4& v);
2029 
2030 /// Return the float dot (inner) product of a and b in every component.
2031 vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);
2032 
2033 /// Return the float dot (inner) product of a and b.
2034 float dot (const vfloat4 &a, const vfloat4 &b);
2035 
2036 /// Return the float 3-component dot (inner) product of a and b in
2037 /// all components.
2038 vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);
2039 
2040 /// Return the float 3-component dot (inner) product of a and b.
2041 float dot3 (const vfloat4 &a, const vfloat4 &b);
2042 
2043 /// Use a bool mask to select between components of a (if mask[i] is false)
2044 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2045 vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);
2046 
2047 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2048 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2049 /// blend(0,a,mask).
2050 vfloat4 blend0 (const vfloat4& a, const vbool4& mask);
2051 
2052 /// Use a bool mask to select between components of a (if mask[i] is false)
2053 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2054 /// blend(0,a,!mask), or blend(a,0,mask).
2055 vfloat4 blend0not (const vfloat4& a, const vbool4& mask);
2056 
2057 /// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
2058 /// that is 0, return 0 rather than Inf.
2059 vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);
2060 
2061 /// Homogeneous divide to turn a vfloat4 into a vfloat3.
2062 vfloat3 hdiv (const vfloat4 &a);
2063 
2064 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2065 /// synonym for blend with arguments rearranged, but this is more clear
2066 /// because the arguments are symmetric to scalar (cond ? a : b).
2067 vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);
2068 
2069 // Per-element math
2070 vfloat4 abs (const vfloat4& a);    ///< absolute value (float)
2071 vfloat4 sign (const vfloat4& a);   ///< 1.0 when value >= 0, -1 when negative
2072 vfloat4 ceil (const vfloat4& a);
2073 vfloat4 floor (const vfloat4& a);
2074 vint4 ifloor (const vfloat4& a);    ///< (int)floor
2075 inline vint4 floori (const vfloat4& a) { return ifloor(a); }  // DEPRECATED(1.8) alias
2076 
2077 /// Per-element round to nearest integer.
2078 /// CAVEAT: the rounding when mid-way between integers may differ depending
2079 /// on hardware. Intel SSE/AVX does "banker's founding" (to nearest even
2080 /// integer) but std::round() says to round away from 0 regardless of
2081 /// current rounding mode (but that is multiple instructions on x64).
2082 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2083 /// match std::round().
2084 vfloat4 round (const vfloat4& a);
2085 
2086 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2087 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2088 /// C++ std::rint() which says to use the current rounding mode.
2089 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2090 /// match std::rint().
2091 vint4 rint (const vfloat4& a);
2092 
2093 vfloat4 rcp_fast (const vfloat4 &a);  ///< Fast, approximate 1/a
2094 vfloat4 sqrt (const vfloat4 &a);
2095 vfloat4 rsqrt (const vfloat4 &a);   ///< Fully accurate 1/sqrt
2096 vfloat4 rsqrt_fast (const vfloat4 &a);  ///< Fast, approximate 1/sqrt
2097 vfloat4 min (const vfloat4& a, const vfloat4& b); ///< Per-element min
2098 vfloat4 max (const vfloat4& a, const vfloat4& b); ///< Per-element max
2099 template <typename T> T exp (const T& v);  // template for all SIMD variants
2100 template <typename T> T log (const T& v);
2101 
2102 /// andnot(a,b) returns ((~a) & b)
2103 vfloat4 andnot (const vfloat4& a, const vfloat4& b);
2104 
2105 // Fused multiply and add (or subtract):
2106 vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
2107 vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
2108 vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
2109 vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c
2110 
2111 /// Transpose the rows and columns of the 4x4 matrix [a b c d].
2112 /// In the end, a will have the original (a[0], b[0], c[0], d[0]),
2113 /// b will have the original (a[1], b[1], c[1], d[1]), and so on.
2114 void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
2115 void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
2116                 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2117 
2118 /// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
2119 vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
2120                  const vfloat4& c, const vfloat4& d);
2121 
2122 
2123 
2124 /// Floating point 3-vector, aligned to be internally identical to a vfloat4.
2125 /// The way it differs from vfloat4 is that all of he load functions only
2126 /// load three values, and all the stores only store 3 values. The vast
2127 /// majority of ops just fall back to the vfloat4 version, and so will
2128 /// operate on the 4th component, but we won't care about that results.
2129 class vfloat3 : public vfloat4 {
2130 public:
2131     static const char* type_name() { return "vfloat3"; }
2132     enum { elements = 3 };    ///< Number of scalar elements
2133     enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
2134 
2135     /// Default constructor (contents undefined)
2136     vfloat3 () { }
2137 
2138     /// Construct from a single value (store it in all slots)
2139     vfloat3 (float a) { load(a); }
2140 
2141     /// Construct from 3 or 4 values
2142     vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }
2143 
2144     /// Construct from a pointer to 4 values
2145     vfloat3 (const float *f) { load (f); }
2146 
2147     /// Copy construct from another vfloat3
2148     vfloat3 (const vfloat3 &other);
2149 
2150     /// Construct from a vfloat4. Note: it will not zero out the internal
2151     /// 4th component, but rather accept on faith that the vfloat4 you are
2152     /// giving it is a valid vfloat3. Be careful!
2153     explicit vfloat3 (const vfloat4 &other);
2154 
2155 #if OIIO_SIMD
2156     /// Construct from the underlying SIMD type. Note: it will not zero out
2157     /// the internal 4th component, but rather accept on faith that the
2158     /// vfloat4 you are giving it is a valid vfloat3. Be careful!
2159     explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
2160 #endif
2161 
2162     /// Construct from a Imath::V3f
2163     vfloat3 (const Imath::V3f &v) : vfloat4(v) { }
2164 
2165     /// Cast to a Imath::V3f
2166     const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
2167 
2168     /// Construct from a pointer to 4 unsigned short values
2169     explicit vfloat3 (const unsigned short *vals) { load(vals); }
2170 
2171     /// Construct from a pointer to 4 short values
2172     explicit vfloat3 (const short *vals) { load(vals); }
2173 
2174     /// Construct from a pointer to 4 unsigned char values
2175     explicit vfloat3 (const unsigned char *vals) { load(vals); }
2176 
2177     /// Construct from a pointer to 4 char values
2178     explicit vfloat3 (const char *vals) { load(vals); }
2179 
2180 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2181     /// Construct from a pointer to 4 half (16 bit float) values
2182     explicit vfloat3 (const half *vals) { load(vals); }
2183 #endif
2184 
2185     /// Assign a single value to all components
2186     const vfloat3 & operator= (float a) { load(a); return *this; }
2187 
2188     /// Return a vfloat3 with all components set to 0.0
2189     static const vfloat3 Zero ();
2190 
2191     /// Return a vfloat3 with all components set to 1.0
2192     static const vfloat3 One ();
2193 
2194     /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
2195     /// Optional argument can give a non-zero starting point and non-1 step.
2196     static const vfloat3 Iota (float start=0.0f, float step=1.0f);
2197 
2198     /// Helper: load a single value into all components
2199     void load (float val);
2200 
2201     /// Load from an array of 4 values
2202     void load (const float *values);
2203 
2204     /// Load from an array of 4 values
2205     void load (const float *values, int n);
2206 
2207     /// Load from an array of 4 unsigned short values, convert to float
2208     void load (const unsigned short *values);
2209 
2210     /// Load from an array of 4 short values, convert to float
2211     void load (const short *values);
2212 
2213     /// Load from an array of 4 unsigned char values, convert to float
2214     void load (const unsigned char *values);
2215 
2216     /// Load from an array of 4 char values, convert to float
2217     void load (const char *values);
2218 
2219 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2220     /// Load from an array of 4 half values, convert to float
2221     void load (const half *values);
2222 #endif /* _HALF_H_ or _IMATH_H_ */
2223 
2224     void store (float *values) const;
2225 
2226     void store (float *values, int n) const;
2227 
2228 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2229     void store (half *values) const;
2230 #endif
2231 
2232     /// Store into an Imath::V3f reference.
2233     void store (Imath::V3f &vec) const;
2234 
2235     // Math operators -- define in terms of vfloat3.
2236     friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
2237     const vfloat3 & operator+= (const vfloat3& a);
2238     vfloat3 operator- () const;
2239     friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
2240     const vfloat3 & operator-= (const vfloat3& a);
2241     friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
2242     friend vfloat3 operator* (const vfloat3& a, float b);
2243     friend vfloat3 operator* (float a, const vfloat3& b);
2244     const vfloat3 & operator*= (const vfloat3& a);
2245     const vfloat3 & operator*= (float a);
2246     friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
2247     const vfloat3 & operator/= (const vfloat3& a);
2248     const vfloat3 & operator/= (float a);
2249 
2250     /// Square of the length of the vector
2251     float length2() const;
2252     /// Length of the vector
2253     float length() const;
2254 
2255     /// Return a normalized version of the vector.
2256     vfloat3 normalized () const;
2257     /// Return a fast, approximate normalized version of the vector.
2258     vfloat3 normalized_fast () const;
2259     /// Normalize in place.
2260     void normalize() { *this = normalized(); }
2261 
2262     /// Stream output
2263     friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
2264 };
2265 
2266 
2267 
2268 // Per-element math on float3
2269 vfloat3 abs (const vfloat3& a);
2270 vfloat3 sign (const vfloat3& a);
2271 vfloat3 ceil (const vfloat3& a);
2272 vfloat3 floor (const vfloat3& a);
2273 vfloat3 round (const vfloat3& a);
2274 
2275 
2276 
2277 /// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
2278 /// not in registers) isomorphic to Imath::M44f.
2279 class matrix44 {
2280 public:
2281     // Uninitialized
2282     OIIO_FORCEINLINE matrix44 ()
2283 #ifndef OIIO_SIMD_SSE
2284         : m_mat(Imath::UNINITIALIZED)
2285 #endif
2286     { }
2287 
2288     /// Construct from a reference to an Imath::M44f
2289     OIIO_FORCEINLINE explicit matrix44 (const Imath::M44f &M) {
2290 #if OIIO_SIMD_SSE
2291         m_row[0].load (M[0]);
2292         m_row[1].load (M[1]);
2293         m_row[2].load (M[2]);
2294         m_row[3].load (M[3]);
2295 #else
2296         m_mat = M;
2297 #endif
2298     }
2299 
2300     /// Construct from a float array
2301     OIIO_FORCEINLINE explicit matrix44 (const float *f) {
2302 #if OIIO_SIMD_SSE
2303         m_row[0].load (f+0);
2304         m_row[1].load (f+4);
2305         m_row[2].load (f+8);
2306         m_row[3].load (f+12);
2307 #else
2308         m_mat = *(const Imath::M44f*)f;
2309 #endif
2310     }
2311 
2312     /// Construct from 4 vfloat4 rows
2313     OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
2314                                         const vfloat4& c, const vfloat4& d) {
2315 #if OIIO_SIMD_SSE
2316         m_row[0] = a; m_row[1] = b; m_row[2] = c; m_row[3] = d;
2317 #else
2318         a.store (m_mat[0]);
2319         b.store (m_mat[1]);
2320         c.store (m_mat[2]);
2321         d.store (m_mat[3]);
2322 #endif
2323     }
2324     /// Construct from 4 float[4] rows
2325     OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
2326                                         const float *c, const float *d) {
2327 #if OIIO_SIMD_SSE
2328         m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2329 #else
2330         memcpy (m_mat[0], a, 4*sizeof(float));
2331         memcpy (m_mat[1], b, 4*sizeof(float));
2332         memcpy (m_mat[2], c, 4*sizeof(float));
2333         memcpy (m_mat[3], d, 4*sizeof(float));
2334 #endif
2335     }
2336 
2337     /// Construct from 16 floats
2338     OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
2339                                float f10, float f11, float f12, float f13,
2340                                float f20, float f21, float f22, float f23,
2341                                float f30, float f31, float f32, float f33)
2342     {
2343 #if OIIO_SIMD_SSE
2344         m_row[0].load (f00, f01, f02, f03);
2345         m_row[1].load (f10, f11, f12, f13);
2346         m_row[2].load (f20, f21, f22, f23);
2347         m_row[3].load (f30, f31, f32, f33);
2348 #else
2349         m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2350         m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2351         m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2352         m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2353 #endif
2354     }
2355 
2356     /// Present as an Imath::M44f
2357     const Imath::M44f& M44f() const;
2358 
2359     /// Return one row
2360     vfloat4 operator[] (int i) const;
2361 
2362     /// Return the transposed matrix
2363     matrix44 transposed () const;
2364 
2365     /// Transform 3-point V by 4x4 matrix M.
2366     vfloat3 transformp (const vfloat3 &V) const;
2367 
2368     /// Transform 3-vector V by 4x4 matrix M.
2369     vfloat3 transformv (const vfloat3 &V) const;
2370 
2371     /// Transform 3-vector V by the transpose of 4x4 matrix M.
2372     vfloat3 transformvT (const vfloat3 &V) const;
2373 
2374     friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
2375     friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);
2376 
2377     bool operator== (const matrix44& m) const;
2378 
2379     bool operator== (const Imath::M44f& m) const ;
2380     friend bool operator== (const Imath::M44f& a, const matrix44 &b);
2381 
2382     bool operator!= (const matrix44& m) const;
2383 
2384     bool operator!= (const Imath::M44f& m) const;
2385     friend bool operator!= (const Imath::M44f& a, const matrix44 &b);
2386 
2387     /// Return the inverse of the matrix.
2388     matrix44 inverse() const;
2389 
2390     /// Stream output
2391     friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);
2392 
2393 private:
2394 #if OIIO_SIMD_SSE
2395     vfloat4 m_row[4];
2396 #else
2397     Imath::M44f m_mat;
2398 #endif
2399 };
2400 
2401 /// Transform 3-point V by 4x4 matrix M.
2402 vfloat3 transformp (const matrix44 &M, const vfloat3 &V);
2403 vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V);
2404 
2405 /// Transform 3-vector V by 4x4 matrix M.
2406 vfloat3 transformv (const matrix44 &M, const vfloat3 &V);
2407 vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V);
2408 
2409 // Transform 3-vector by the transpose of 4x4 matrix M.
2410 vfloat3 transformvT (const matrix44 &M, const vfloat3 &V);
2411 vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V);
2412 
2413 
2414 
2415 
2416 /// Floating point 8-vector, accelerated by SIMD instructions when
2417 /// available.
2418 class vfloat8 {
2419 public:
2420     static const char* type_name() { return "vfloat8"; }
2421     typedef float value_t;    ///< Underlying equivalent scalar value type
2422     enum { elements = 8 };    ///< Number of scalar elements
2423     enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
2424     enum { bits = elements*32 }; ///< Total number of bits
2425     typedef simd_raw_t<float,8>::type simd_t;  ///< the native SIMD type used
2426     typedef vfloat8 vfloat_t; ///< SIMD int type
2427     typedef vint8 vint_t;     ///< SIMD int type
2428     typedef vbool8 vbool_t;   ///< SIMD bool type
2429     typedef vint8 int_t;      // old name (deprecated 1.8)
2430     typedef vbool8 bool_t;    // old name (deprecated 1.8)
2431 
2432     /// Default constructor (contents undefined)
2433     vfloat8 () { }
2434 
2435     /// Construct from a single value (store it in all slots)
2436     vfloat8 (float a) { load(a); }
2437 
2438     /// Construct from 8 values
2439     vfloat8 (float a, float b, float c, float d,
2440             float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); }
2441 
2442     /// Construct from a pointer to 8 values
2443     vfloat8 (const float *f) { load (f); }
2444 
2445     /// Copy construct from another vfloat8
2446     vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }
2447 
2448     /// Construct from an int vector (promoting all components to float)
2449     explicit vfloat8 (const vint8& ival);
2450 
2451     /// Construct from two vfloat4's
2452     vfloat8 (const vfloat4 &lo, const vfloat4 &hi);
2453 
2454     /// Construct from the underlying SIMD type
2455     vfloat8 (const simd_t& m) : m_simd(m) { }
2456 
2457     /// Return the raw SIMD type
2458     operator simd_t () const { return m_simd; }
2459     simd_t simd () const { return m_simd; }
2460     simd_t& simd () { return m_simd; }
2461 
2462     /// Return a pointer to the underlying scalar type
2463     const value_t* data () const { return (const value_t*)this; }
2464     value_t* data () { return (value_t*)this; }
2465 
2466     /// Construct from a pointer to unsigned short values
2467     explicit vfloat8 (const unsigned short *vals) { load(vals); }
2468 
2469     /// Construct from a pointer to short values
2470     explicit vfloat8 (const short *vals) { load(vals); }
2471 
2472     /// Construct from a pointer to unsigned char values
2473     explicit vfloat8 (const unsigned char *vals) { load(vals); }
2474 
2475     /// Construct from a pointer to char values
2476     explicit vfloat8 (const char *vals) { load(vals); }
2477 
2478 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2479     /// Construct from a pointer to half (16 bit float) values
2480     explicit vfloat8 (const half *vals) { load(vals); }
2481 #endif
2482 
2483     /// Assign a single value to all components
2484     const vfloat8& operator= (float a) { load(a); return *this; }
2485 
2486     /// Assign a vfloat8
2487     const vfloat8& operator= (vfloat8 other) {
2488         m_simd = other.m_simd;
2489         return *this;
2490     }
2491 
2492     /// Return a vfloat8 with all components set to 0.0
2493     static const vfloat8 Zero ();
2494 
2495     /// Return a vfloat8 with all components set to 1.0
2496     static const vfloat8 One ();
2497 
2498     /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...)
2499     /// Optional argument can give a non-zero starting point and non-1 step.
2500     static const vfloat8 Iota (float start=0.0f, float step=1.0f);
2501 
2502     /// Set all components to 0.0
2503     void clear ();
2504 
2505     /// Component access (get)
2506     float operator[] (int i) const;
2507     /// Component access (set)
2508     float& operator[] (int i);
2509 
2510     /// Component access (set).
2511     void setcomp (int i, float value);
2512 
2513     value_t x () const;
2514     value_t y () const;
2515     value_t z () const;
2516     value_t w () const;
2517     void set_x (value_t val);
2518     void set_y (value_t val);
2519     void set_z (value_t val);
2520     void set_w (value_t val);
2521 
2522     /// Extract the lower precision vfloat4
2523     vfloat4 lo () const;
2524 
2525     /// Extract the higher precision vfloat4
2526     vfloat4 hi () const;
2527 
2528     /// Helper: load a single value into all components
2529     void load (float val);
2530 
2531     /// Helper: load 8 values
2532     void load (float a, float b, float c, float d,
2533                float e, float f, float g, float h);
2534 
2535     /// Load from an array of values
2536     void load (const float *values);
2537 
2538     /// Load from a partial array of <=8 values. Unassigned values are
2539     /// undefined.
2540     void load (const float *values, int n);
2541 
2542     /// Load from an array of 8 unsigned short values, convert to float
2543     void load (const unsigned short *values);
2544 
2545     /// Load from an array of 8 short values, convert to float
2546     void load (const short *values);
2547 
2548     /// Load from an array of 8 unsigned char values, convert to float
2549     void load (const unsigned char *values);
2550 
2551     /// Load from an array of 8 char values, convert to float
2552     void load (const char *values);
2553 
2554 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2555     /// Load from an array of 8 half values, convert to float
2556     void load (const half *values);
2557 #endif /* _HALF_H_ or _IMATH_H_ */
2558 
2559     void store (float *values) const;
2560 
2561     /// Store the first n values into memory
2562     void store (float *values, int n) const;
2563 
2564 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2565     void store (half *values) const;
2566 #endif
2567 
2568     /// Masked load -- read from values[] where mask is 1, load zero where
2569     /// mask is 0.
2570     void load_mask (int mask, const value_t *values);
2571     void load_mask (const vbool_t& mask, const value_t *values);
2572 
2573     /// Masked store -- write to values[] where mask is enabled, don't
2574     /// touch values[] where it's not.
2575     void store_mask (int mask, value_t *values) const;
2576     void store_mask (const vbool_t& mask, value_t *values) const;
2577 
2578     /// Load values from addresses  (char*)basepatr + vindex[i]*scale
2579     template<int scale=4>
2580     void gather (const value_t *baseptr, const vint_t& vindex);
2581     template<int scale=4>
2582     // Fastest way to fill with all 1 bits is to cmp any value to itself.
2583     void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
2584     template<int scale=4>
2585     void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
2586 
2587     /// Store values at addresses  (char*)basepatr + vindex[i]*scale
2588     template<int scale=4>
2589     void scatter (value_t *baseptr, const vint_t& vindex) const;
2590     /// Scatter elements defined by the mask
2591     template<int scale=4>
2592     void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2593     template<int scale=4>
2594     void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
2595 
2596     // Arithmetic operators (component-by-component)
2597     friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
2598     friend vfloat8 operator- (const vfloat8& a);
2599     friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
2600     friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
2601     friend vfloat8 operator* (const vfloat8& a, float b);
2602     friend vfloat8 operator* (float a, const vfloat8& b);
2603     friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
2604     friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
2605     friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
2606     friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
2607     friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
2608     friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);
2609 
2610     // Comparison operations
2611     friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
2612     friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
2613     friend vbool8 operator<  (const vfloat8& a, const vfloat8& b);
2614     friend vbool8 operator>  (const vfloat8& a, const vfloat8& b);
2615     friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
2616     friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);
2617 
2618     // Some oddball items that are handy
2619 
2620     /// Stream output
2621     friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);
2622 
2623 protected:
2624     // The actual data representation
2625     union {
2626         simd_t  m_simd;
2627         value_t m_val[paddedelements];
2628         vfloat4 m_4[2];
2629     };
2630 };
2631 
2632 
2633 /// Helper: shuffle/swizzle with constant (templated) indices.
2634 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
2635 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
2636 vfloat8 shuffle (const vfloat8& a);
2637 
2638 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2639 template<int i> vfloat8 shuffle (const vfloat8& a);
2640 
2641 /// Helper: as rapid as possible extraction of one component, when the
2642 /// index is fixed.
2643 template<int i> float extract (const vfloat8& a);
2644 
2645 /// Helper: substitute val for a[i]
2646 template<int i> vfloat8 insert (const vfloat8& a, float val);
2647 
2648 /// The sum of all components, returned in all components.
2649 vfloat8 vreduce_add (const vfloat8& v);
2650 
2651 /// The sum of all components, returned as a scalar.
2652 float reduce_add (const vfloat8& v);
2653 
2654 /// Return the float dot (inner) product of a and b in every component.
2655 vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);
2656 
2657 /// Return the float dot (inner) product of a and b.
2658 float dot (const vfloat8 &a, const vfloat8 &b);
2659 
2660 /// Return the float 3-component dot (inner) product of a and b in
2661 /// all components.
2662 vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);
2663 
2664 /// Return the float 3-component dot (inner) product of a and b.
2665 float dot3 (const vfloat8 &a, const vfloat8 &b);
2666 
2667 /// Use a bool mask to select between components of a (if mask[i] is false)
2668 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2669 vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);
2670 
2671 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2672 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2673 /// blend(0,a,mask).
2674 vfloat8 blend0 (const vfloat8& a, const vbool8& mask);
2675 
2676 /// Use a bool mask to select between components of a (if mask[i] is false)
2677 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2678 /// blend(0,a,!mask), or blend(a,0,mask).
2679 vfloat8 blend0not (const vfloat8& a, const vbool8& mask);
2680 
2681 /// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
2682 /// that is 0, return 0 rather than Inf.
2683 vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);
2684 
2685 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2686 /// synonym for blend with arguments rearranged, but this is more clear
2687 /// because the arguments are symmetric to scalar (cond ? a : b).
2688 vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);
2689 
2690 // Per-element math
2691 vfloat8 abs (const vfloat8& a);    ///< absolute value (float)
2692 vfloat8 sign (const vfloat8& a);   ///< 1.0 when value >= 0, -1 when negative
2693 vfloat8 ceil (const vfloat8& a);
2694 vfloat8 floor (const vfloat8& a);
2695 vint8 ifloor (const vfloat8& a);    ///< (int)floor
2696 inline vint8 floori (const vfloat8& a) { return ifloor(a); }  // DEPRECATED(1.8) alias
2697 
2698 /// Per-element round to nearest integer.
2699 /// CAVEAT: the rounding when mid-way between integers may differ depending
2700 /// on hardware. Intel SSE/AVX does "banker's founding" (to nearest even
2701 /// integer) but std::round() says to round away from 0 regardless of
2702 /// current rounding mode (but that is multiple instructions on x64).
2703 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2704 /// match std::round().
2705 vfloat8 round (const vfloat8& a);
2706 
2707 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2708 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2709 /// C++ std::rint() which says to use the current rounding mode.
2710 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2711 /// match std::rint().
2712 vint8 rint (const vfloat8& a);
2713 
2714 vfloat8 rcp_fast (const vfloat8 &a);  ///< Fast, approximate 1/a
2715 vfloat8 sqrt (const vfloat8 &a);
2716 vfloat8 rsqrt (const vfloat8 &a);   ///< Fully accurate 1/sqrt
2717 vfloat8 rsqrt_fast (const vfloat8 &a);  ///< Fast, approximate 1/sqrt
2718 vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min
2719 vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max
2720 // vfloat8 exp (const vfloat8& v);  // See template with vfloat4
2721 // vfloat8 log (const vfloat8& v);  // See template with vfloat4
2722 
2723 /// andnot(a,b) returns ((~a) & b)
2724 vfloat8 andnot (const vfloat8& a, const vfloat8& b);
2725 
2726 // Fused multiply and add (or subtract):
2727 vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b + c
2728 vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b - c
2729 vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c
2730 vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c
2731 
2732 
2733 
2734 /// Floating point 16-vector, accelerated by SIMD instructions when
2735 /// available.
2736 class vfloat16 {
2737 public:
2738     static const char* type_name() { return "vfloat16"; }
2739     typedef float value_t;    ///< Underlying equivalent scalar value type
2740     enum { elements = 16 };    ///< Number of scalar elements
2741     enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
2742     enum { bits = elements*32 }; ///< Total number of bits
2743     typedef simd_raw_t<float,16>::type simd_t;  ///< the native SIMD type used
2744     typedef vfloat16 vfloat_t; ///< SIMD int type
2745     typedef vint16 vint_t;     ///< SIMD int type
2746     typedef vbool16 vbool_t;   ///< SIMD bool type
2747     typedef vint16 int_t;      // old name (deprecated 1.8)
2748     typedef vbool16 bool_t;    // old name (deprecated 1.8)
2749 
2750     /// Default constructor (contents undefined)
2751     vfloat16 () { }
2752 
2753     /// Construct from a single value (store it in all slots)
2754     vfloat16 (float a) { load(a); }
2755 
2756     /// Construct from 16 values
2757     vfloat16 (float v0, float v1, float v2, float v3,
2758              float v4, float v5, float v6, float v7,
2759              float v8, float v9, float v10, float v11,
2760              float v12, float v13, float v14, float v15);
2761 
2762     /// Construct from a pointer to 16 values
2763     vfloat16 (const float *f) { load (f); }
2764 
2765     /// Copy construct from another vfloat16
2766     vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }
2767 
2768     /// Construct from an int vector (promoting all components to float)
2769     explicit vfloat16 (const vint16& ival);
2770 
2771     /// Construct from two vfloat8's
2772     vfloat16 (const vfloat8 &lo, const vfloat8 &hi);
2773 
2774     /// Construct from four vfloat4's
2775     vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d);
2776 
2777     /// Construct from the underlying SIMD type
2778     vfloat16 (const simd_t& m) : m_simd(m) { }
2779 
2780     /// Return the raw SIMD type
2781     operator simd_t () const { return m_simd; }
2782     simd_t simd () const { return m_simd; }
2783     simd_t& simd () { return m_simd; }
2784 
2785     /// Return a pointer to the underlying scalar type
2786     const value_t* data () const { return (const value_t*)this; }
2787     value_t* data () { return (value_t*)this; }
2788 
2789     /// Construct from a pointer to unsigned short values
2790     explicit vfloat16 (const unsigned short *vals) { load(vals); }
2791 
2792     /// Construct from a pointer to short values
2793     explicit vfloat16 (const short *vals) { load(vals); }
2794 
2795     /// Construct from a pointer to unsigned char values
2796     explicit vfloat16 (const unsigned char *vals) { load(vals); }
2797 
2798     /// Construct from a pointer to char values
2799     explicit vfloat16 (const char *vals) { load(vals); }
2800 
2801 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2802     /// Construct from a pointer to half (16 bit float) values
2803     explicit vfloat16 (const half *vals) { load(vals); }
2804 #endif
2805 
2806     /// Assign a single value to all components
2807     const vfloat16& operator= (float a) { load(a); return *this; }
2808 
2809     /// Assign a vfloat16
2810     const vfloat16& operator= (vfloat16 other) {
2811         m_simd = other.m_simd;
2812         return *this;
2813     }
2814 
2815     /// Return a vfloat16 with all components set to 0.0
2816     static const vfloat16 Zero ();
2817 
2818     /// Return a vfloat16 with all components set to 1.0
2819     static const vfloat16 One ();
2820 
2821     /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...)
2822     /// Optional argument can give a non-zero starting point and non-1 step.
2823     static const vfloat16 Iota (float start=0.0f, float step=1.0f);
2824 
2825     /// Set all components to 0.0
2826     void clear ();
2827 
2828     /// Component access (get)
2829     float operator[] (int i) const;
2830     /// Component access (set)
2831     float& operator[] (int i);
2832 
2833     /// Component access (set).
2834     void setcomp (int i, float value);
2835 
2836     value_t x () const;
2837     value_t y () const;
2838     value_t z () const;
2839     value_t w () const;
2840     void set_x (value_t val);
2841     void set_y (value_t val);
2842     void set_z (value_t val);
2843     void set_w (value_t val);
2844 
2845     /// Extract the lower precision vfloat8
2846     vfloat8 lo () const;
2847 
2848     /// Extract the higher precision vfloat8
2849     vfloat8 hi () const;
2850 
2851     /// Helper: load a single value into all components
2852     void load (float val);
2853 
2854     /// Load separate values into each component.
2855     void load (float v0, float v1, float v2, float v3,
2856                float v4, float v5, float v6, float v7,
2857                float v8, float v9, float v10, float v11,
2858                float v12, float v13, float v14, float v15);
2859 
2860     /// Load from an array of values
2861     void load (const float *values);
2862 
2863     /// Load from a partial array of <=16 values. Unassigned values are
2864     /// undefined.
2865     void load (const float *values, int n);
2866 
2867     /// Load from an array of 16 unsigned short values, convert to float
2868     void load (const unsigned short *values);
2869 
2870     /// Load from an array of 16 short values, convert to float
2871     void load (const short *values);
2872 
2873     /// Load from an array of 16 unsigned char values, convert to float
2874     void load (const unsigned char *values);
2875 
2876     /// Load from an array of 16 char values, convert to float
2877     void load (const char *values);
2878 
2879 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2880     /// Load from an array of 16 half values, convert to float
2881     void load (const half *values);
2882 #endif /* _HALF_H_ or _IMATH_H_ */
2883 
2884     void store (float *values) const;
2885 
2886     /// Store the first n values into memory
2887     void store (float *values, int n) const;
2888 
2889 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2890     void store (half *values) const;
2891 #endif
2892 
2893     /// Masked load -- read from values[] where mask is 1, load zero where
2894     /// mask is 0.
2895     void load_mask (const vbool_t &mask, const value_t *values);
2896     void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
2897 
2898     /// Masked store -- write to values[] where mask is enabled, don't
2899     /// touch values[] where it's not.
2900     void store_mask (const vbool_t &mask, value_t *values) const;
2901     void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
2902 
2903     /// Load values from addresses  (char*)basepatr + vindex[i]*scale
2904     template<int scale=4>
2905     void gather (const value_t *baseptr, const vint_t& vindex);
2906     /// Gather elements defined by the mask, leave others unchanged.
2907     template<int scale=4>
2908     void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
2909     template<int scale=4>
2910     void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
2911         gather_mask<scale> (vbool_t(mask), baseptr, vindex);
2912     }
2913 
2914     /// Store values at addresses  (char*)basepatr + vindex[i]*scale
2915     template<int scale=4>
2916     void scatter (value_t *baseptr, const vint_t& vindex) const;
2917     /// Scatter elements defined by the mask
2918     template<int scale=4>
2919     void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2920     template<int scale=4>
2921     void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
2922         scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
2923     }
2924 
2925     // Arithmetic operators (component-by-component)
2926     friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b);
2927     friend vfloat16 operator- (const vfloat16& a);
2928     friend vfloat16 operator- (const vfloat16& a, const vfloat16& b);
2929     friend vfloat16 operator* (const vfloat16& a, const vfloat16& b);
2930     friend vfloat16 operator* (const vfloat16& a, float b);
2931     friend vfloat16 operator* (float a, const vfloat16& b);
2932     friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b);
2933     friend vfloat16 operator% (const vfloat16& a, const vfloat16& b);
2934     friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b);
2935     friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b);
2936     friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b);
2937     friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b);
2938 
2939     // Comparison operations
2940     friend vbool16 operator== (const vfloat16& a, const vfloat16& b);
2941     friend vbool16 operator!= (const vfloat16& a, const vfloat16& b);
2942     friend vbool16 operator<  (const vfloat16& a, const vfloat16& b);
2943     friend vbool16 operator>  (const vfloat16& a, const vfloat16& b);
2944     friend vbool16 operator>= (const vfloat16& a, const vfloat16& b);
2945     friend vbool16 operator<= (const vfloat16& a, const vfloat16& b);
2946 
2947     // Some oddball items that are handy
2948 
2949     /// Stream output
2950     friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val);
2951 
2952 protected:
2953     // The actual data representation
2954     union {
2955         simd_t  m_simd;
2956         value_t m_val[paddedelements];
2957         vfloat8  m_8[2];
2958     };
2959 };
2960 
2961 
2962 /// Shuffle groups of 4
2963 template<int i0, int i1, int i2, int i3>
2964 vfloat16 shuffle4 (const vfloat16& a);
2965 
2966 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
2967 template<int i> vfloat16 shuffle4 (const vfloat16& a);
2968 
2969 /// Shuffle within each group of 4
2970 template<int i0, int i1, int i2, int i3>
2971 vfloat16 shuffle (const vfloat16& a);
2972 
2973 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2974 template<int i> vfloat16 shuffle (const vfloat16& a);
2975 
2976 /// Helper: as rapid as possible extraction of one component, when the
2977 /// index is fixed.
2978 template<int i> float extract (const vfloat16& a);
2979 
2980 /// Helper: substitute val for a[i]
2981 template<int i> vfloat16 insert (const vfloat16& a, float val);
2982 
2983 /// The sum of all components, returned in all components.
2984 vfloat16 vreduce_add (const vfloat16& v);
2985 
2986 /// The sum of all components, returned as a scalar.
2987 float reduce_add (const vfloat16& v);
2988 
2989 /// Use a bool mask to select between components of a (if mask[i] is false)
2990 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2991 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool4& mask);
2992 
2993 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2994 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2995 /// blend(0,a,mask).
2996 vfloat16 blend0 (const vfloat16& a, const vbool4& mask);
2997 
2998 /// Use a bool mask to select between components of a (if mask[i] is false)
2999 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
3000 /// blend(0,a,!mask), or blend(a,0,mask).
3001 vfloat16 blend0not (const vfloat16& a, const vbool4& mask);
3002 
3003 /// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor
3004 /// that is 0, return 0 rather than Inf.
3005 vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b);
3006 
3007 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
3008 /// synonym for blend with arguments rearranged, but this is more clear
3009 /// because the arguments are symmetric to scalar (cond ? a : b).
3010 vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);
3011 
3012 // Per-element math
3013 vfloat16 abs (const vfloat16& a);    ///< absolute value (float)
3014 vfloat16 sign (const vfloat16& a);   ///< 1.0 when value >= 0, -1 when negative
3015 vfloat16 ceil (const vfloat16& a);
3016 vfloat16 floor (const vfloat16& a);
3017 vint16 ifloor (const vfloat16& a);    ///< (int)floor
3018 inline vint16 floori (const vfloat16& a) { return ifloor(a); }  // DEPRECATED(1.8) alias
3019 
3020 /// Per-element round to nearest integer.
3021 /// CAVEAT: the rounding when mid-way between integers may differ depending
3022 /// on hardware. Intel SSE/AVX does "banker's founding" (to nearest even
3023 /// integer) but std::round() says to round away from 0 regardless of
3024 /// current rounding mode (but that is multiple instructions on x64).
3025 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3026 /// match std::round().
3027 vfloat16 round (const vfloat16& a);
3028 
3029 /// Per-element round to nearest integer (equivalent to vint(round(a))).
3030 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
3031 /// C++ std::rint() which says to use the current rounding mode.
3032 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3033 /// match std::rint().
3034 vint16 rint (const vfloat16& a);
3035 
3036 vfloat16 rcp_fast (const vfloat16 &a);  ///< Fast, approximate 1/a
3037 vfloat16 sqrt (const vfloat16 &a);
3038 vfloat16 rsqrt (const vfloat16 &a);   ///< Fully accurate 1/sqrt
3039 vfloat16 rsqrt_fast (const vfloat16 &a);  ///< Fast, approximate 1/sqrt
3040 vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
3041 vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
3042 // vfloat16 exp (const vfloat16& v);  // See template with vfloat4
3043 // vfloat16 log (const vfloat16& v);  // See template with vfloat4
3044 
3045 /// andnot(a,b) returns ((~a) & b)
3046 vfloat16 andnot (const vfloat16& a, const vfloat16& b);
3047 
3048 // Fused multiply and add (or subtract):
3049 vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b + c
3050 vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b - c
3051 vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
3052 vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c
3053 
3054 
3055 
3056 // Odds and ends, other CPU hardware tricks
3057 
3058 // Try to set the flush_zero_mode CPU flag on x86. Return true if we are
3059 // able, otherwise false (because it's not available on that platform,
3060 // or because it's gcc 4.8 which has a bug that lacks this intrinsic).
3061 inline bool set_flush_zero_mode (bool on) {
3062 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3063     _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
3064     return true;
3065 #endif
3066     return false;
3067 }
3068 
3069 // Try to set the denorms_zero_mode CPU flag on x86. Return true if we are
3070 // able, otherwise false (because it's not available on that platform,
3071 // or because it's gcc 4.8 which has a bug that lacks this intrinsic).
3072 inline bool set_denorms_zero_mode (bool on) {
3073 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3074     _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
3075     return true;
3076 #endif
3077     return false;
3078 }
3079 
3080 // Get the flush_zero_mode CPU flag on x86.
3081 inline bool get_flush_zero_mode () {
3082 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3083     return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
3084 #endif
3085     return false;
3086 }
3087 
3088 // Get the denorms_zero_mode CPU flag on x86.
3089 inline bool get_denorms_zero_mode () {
3090 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3091     return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
3092 #endif
3093     return false;
3094 }
3095 
3096 
3097 
3098 
3099 
3100 
3101 //////////////////////////////////////////////////////////////////////////
3102 //////////////////////////////////////////////////////////////////////////
3103 //
3104 // Gory implementation details follow.
3105 //
3106 // ^^^ All declarations and documention is above ^^^
3107 //
3108 // vvv Below is the implementation, often considerably cluttered with
3109 //     #if's for each architeture, and unapologitic use of intrinsics and
3110 //     every manner of dirty trick we can think of to make things fast.
3111 //     Some of this isn't pretty. We won't recapitulate comments or
3112 //     documentation of what the functions are supposed to do, please
3113 //     consult the declarations above for that.
3114 //
3115 //     Here be dragons.
3116 //
3117 //////////////////////////////////////////////////////////////////////////
3118 //////////////////////////////////////////////////////////////////////////
3119 
3120 
3121 
3122 //////////////////////////////////////////////////////////////////////
3123 // vbool4 implementation
3124 
3125 
3126 OIIO_FORCEINLINE int vbool4::operator[] (int i) const {
3127     OIIO_DASSERT(i >= 0 && i < elements);
3128 #if OIIO_SIMD_SSE
3129     return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3130 #else
3131     return m_val[i];
3132 #endif
3133 }
3134 
3135 OIIO_FORCEINLINE int& vbool4::operator[] (int i) {
3136     OIIO_DASSERT(i >= 0 && i < elements);
3137     return m_val[i];
3138 }
3139 
3140 
3141 OIIO_FORCEINLINE void vbool4::setcomp (int i, bool value) {
3142     OIIO_DASSERT(i >= 0 && i < elements);
3143     m_val[i] = value ? -1 : 0;
3144 }
3145 
3146 
3147 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) {
3148     cout << a[0];
3149     for (int i = 1; i < a.elements; ++i)
3150         cout << ' ' << a[i];
3151     return cout;
3152 }
3153 
3154 
3155 OIIO_FORCEINLINE void vbool4::load (bool a) {
3156 #if OIIO_SIMD_SSE
3157     m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a)));
3158 #elif OIIO_SIMD_NEON
3159     m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3160 #else
3161     int val = -int(a);
3162     SIMD_CONSTRUCT (val);
3163 #endif
3164 }
3165 
3166 
3167 OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
3168 #if OIIO_SIMD_SSE
3169     // N.B. -- we need to reverse the order because of our convention
3170     // of storing a,b,c,d in the same order in memory.
3171     m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
3172 // #elif OIIO_SIMD_NEON
3173 // FIXME
3174 #else
3175     m_val[0] = -int(a);
3176     m_val[1] = -int(b);
3177     m_val[2] = -int(c);
3178     m_val[3] = -int(d);
3179 #endif
3180 }
3181 
3182 OIIO_FORCEINLINE vbool4::vbool4 (const bool *a) {
3183     load (a[0], a[1], a[2], a[3]);
3184 }
3185 
3186 OIIO_FORCEINLINE const vbool4& vbool4::operator= (const vbool4 & other) {
3187     m_simd = other.m_simd;
3188     return *this;
3189 }
3190 
3191 
3192 OIIO_FORCEINLINE int vbool4::bitmask () const {
3193 #if OIIO_SIMD_SSE
3194     return _mm_movemask_ps(m_simd);
3195 #else
3196     int r = 0;
3197     for (int i = 0; i < elements; ++i)
3198         if (m_val[i])
3199             r |= 1<<i;
3200     return r;
3201 #endif
3202 }
3203 
3204 
3205 OIIO_FORCEINLINE vbool4
3206 vbool4::from_bitmask (int bitmask) {
3207     // I think this is a fast conversion from int bitmask to vbool4
3208     return (vint4::Giota() & vint4(bitmask)) != vint4::Zero();
3209 }
3210 
3211 
3212 OIIO_FORCEINLINE void vbool4::clear () {
3213 #if OIIO_SIMD_SSE
3214     m_simd = _mm_setzero_ps();
3215 #else
3216     *this = false;
3217 #endif
3218 }
3219 
3220 
3221 OIIO_FORCEINLINE const vbool4 vbool4::False () {
3222 #if OIIO_SIMD_SSE
3223     return _mm_setzero_ps();
3224 #else
3225     return false;
3226 #endif
3227 }
3228 
3229 OIIO_FORCEINLINE const vbool4 vbool4::True () {
3230     // Fastest way to fill with all 1 bits is to cmp any value to itself.
3231 #if OIIO_SIMD_SSE
3232 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3233     __m128i anyval = _mm_undefined_si128();
3234 # else
3235     __m128i anyval = _mm_setzero_si128();
3236 # endif
3237     return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3238 #else
3239     return true;
3240 #endif
3241 }
3242 
3243 OIIO_FORCEINLINE void vbool4::store (bool *values) const {
3244     SIMD_DO (values[i] = m_val[i] ? true : false);
3245 }
3246 
3247 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const {
3248     OIIO_DASSERT (n >= 0 && n <= elements);
3249     for (int i = 0; i < n; ++i)
3250         values[i] = m_val[i] ? true : false;
3251 }
3252 
3253 
3254 
3255 OIIO_FORCEINLINE vbool4 operator! (const vbool4 & a) {
3256 #if OIIO_SIMD_SSE
3257     return _mm_xor_ps (a.simd(), vbool4::True());
3258 #elif OIIO_SIMD_NEON
3259     return vmvnq_u32(a.simd());
3260 #else
3261     SIMD_RETURN (vbool4, a[i] ^ (-1));
3262 #endif
3263 }
3264 
3265 OIIO_FORCEINLINE vbool4 operator& (const vbool4 & a, const vbool4 & b) {
3266 #if OIIO_SIMD_SSE
3267     return _mm_and_ps (a.simd(), b.simd());
3268 #elif OIIO_SIMD_NEON
3269     return vandq_u32(a.simd(), b.simd());
3270 #else
3271     SIMD_RETURN (vbool4, a[i] & b[i]);
3272 #endif
3273 }
3274 
3275 OIIO_FORCEINLINE vbool4 operator| (const vbool4 & a, const vbool4 & b) {
3276 #if OIIO_SIMD_SSE
3277     return _mm_or_ps (a.simd(), b.simd());
3278 #elif OIIO_SIMD_NEON
3279     return vorrq_u32(a.simd(), b.simd());
3280 #else
3281     SIMD_RETURN (vbool4, a[i] | b[i]);
3282 #endif
3283 }
3284 
3285 OIIO_FORCEINLINE vbool4 operator^ (const vbool4& a, const vbool4& b) {
3286 #if OIIO_SIMD_SSE
3287     return _mm_xor_ps (a.simd(), b.simd());
3288 #elif OIIO_SIMD_NEON
3289     return veorq_u32(a.simd(), b.simd());
3290 #else
3291     SIMD_RETURN (vbool4, a[i] ^ b[i]);
3292 #endif
3293 }
3294 
3295 
3296 OIIO_FORCEINLINE const vbool4& operator&= (vbool4& a, const vbool4 &b) {
3297     return a = a & b;
3298 }
3299 
3300 OIIO_FORCEINLINE const vbool4& operator|= (vbool4& a, const vbool4& b) {
3301     return a = a | b;
3302 }
3303 
3304 OIIO_FORCEINLINE const vbool4& operator^= (vbool4& a, const vbool4& b) {
3305     return a = a ^ b;
3306 }
3307 
3308 OIIO_FORCEINLINE vbool4 operator~ (const vbool4& a) {
3309 #if OIIO_SIMD_SSE
3310     // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3311     return _mm_xor_ps (a.simd(), vbool4::True());
3312 #elif OIIO_SIMD_NEON
3313     return vmvnq_u32(a.m_simd);
3314 #else
3315     SIMD_RETURN (vbool4, ~a[i]);
3316 #endif
3317 }
3318 
3319 OIIO_FORCEINLINE vbool4 operator== (const vbool4 & a, const vbool4 & b) {
3320 #if OIIO_SIMD_SSE
3321     return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3322 #elif OIIO_SIMD_NEON
3323     return vceqq_u32 (a.m_simd, b.m_simd);
3324 #else
3325     SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
3326 #endif
3327 }
3328 
3329 OIIO_FORCEINLINE vbool4 operator!= (const vbool4 & a, const vbool4 & b) {
3330 #if OIIO_SIMD_SSE
3331     return _mm_xor_ps (a, b);
3332 #elif OIIO_SIMD_NEON
3333     return !(a == b);
3334 #else
3335     SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
3336 #endif
3337 }
3338 
3339 
3340 
3341 
3342 #if OIIO_SIMD_SSE
3343 // Shuffling. Use like this:  x = shuffle<3,2,1,0>(b)
3344 template<int i0, int i1, int i2, int i3>
3345 OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) {
3346     return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
3347 }
3348 #endif
3349 
3350 #if OIIO_SIMD_SSE >= 3
3351 // SSE3 has intrinsics for a few special cases
3352 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) {
3353     return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a)));
3354 }
3355 template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) {
3356     return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a)));
3357 }
3358 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) {
3359     return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(a)));
3360 }
3361 #endif
3362 
3363 #if OIIO_SIMD_SSE
3364 template<int i0, int i1, int i2, int i3>
3365 OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) {
3366     return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
3367 }
3368 #endif
3369 
3370 #if OIIO_SIMD_SSE >= 3
3371 // SSE3 has intrinsics for a few special cases
3372 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 0, 2, 2> (__m128 a) {
3373     return _mm_moveldup_ps(a);
3374 }
3375 template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) {
3376     return _mm_movehdup_ps(a);
3377 }
3378 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) {
3379     return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3380 }
3381 #endif
3382 
3383 
3384 /// Helper: shuffle/swizzle with constant (templated) indices.
3385 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
3386 template<int i0, int i1, int i2, int i3>
3387 OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3388 #if OIIO_SIMD_SSE
3389     return shuffle_sse<i0,i1,i2,i3> (a.simd());
3390 #else
3391     return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3392 #endif
3393 }
3394 
3395 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3396 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3397     return shuffle<i,i,i,i>(a);
3398 }
3399 
3400 
3401 /// Helper: as rapid as possible extraction of one component, when the
3402 /// index is fixed.
3403 template<int i>
3404 OIIO_FORCEINLINE bool extract (const vbool4& a) {
3405 #if OIIO_SIMD_SSE >= 4
3406     return _mm_extract_epi32(_mm_castps_si128(a.simd()), i);  // SSE4.1 only
3407 #else
3408     return a[i];
3409 #endif
3410 }
3411 
3412 /// Helper: substitute val for a[i]
3413 template<int i>
3414 OIIO_FORCEINLINE vbool4 insert (const vbool4& a, bool val) {
3415 #if OIIO_SIMD_SSE >= 4
3416     int ival = -int(val);
3417     return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3418 #else
3419     vbool4 tmp = a;
3420     tmp[i] = -int(val);
3421     return tmp;
3422 #endif
3423 }
3424 
3425 OIIO_FORCEINLINE bool reduce_and (const vbool4& v) {
3426 #if OIIO_SIMD_AVX
3427     return _mm_testc_ps (v, vbool4(true)) != 0;
3428 #elif OIIO_SIMD_SSE
3429     return _mm_movemask_ps(v.simd()) == 0xf;
3430 #else
3431     SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0));
3432 #endif
3433 }
3434 
3435 OIIO_FORCEINLINE bool reduce_or (const vbool4& v) {
3436 #if OIIO_SIMD_AVX
3437     return ! _mm_testz_ps (v, v);
3438 #elif OIIO_SIMD_SSE
3439     return _mm_movemask_ps(v) != 0;
3440 #else
3441     SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0));
3442 #endif
3443 }
3444 
3445 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; }
3446 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; }
3447 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; }
3448 
3449 
3450 
3451 //////////////////////////////////////////////////////////////////////
3452 // vbool8 implementation
3453 
3454 
3455 OIIO_FORCEINLINE int vbool8::operator[] (int i) const {
3456     OIIO_DASSERT(i >= 0 && i < elements);
3457 #if OIIO_SIMD_AVX
3458     return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3459 #else
3460     return m_val[i];
3461 #endif
3462 }
3463 
3464 OIIO_FORCEINLINE void vbool8::setcomp (int i, bool value) {
3465     OIIO_DASSERT(i >= 0 && i < elements);
3466     m_val[i] = value ? -1 : 0;
3467 }
3468 
3469 OIIO_FORCEINLINE int& vbool8::operator[] (int i) {
3470     OIIO_DASSERT(i >= 0 && i < elements);
3471     return m_val[i];
3472 }
3473 
3474 
3475 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) {
3476     cout << a[0];
3477     for (int i = 1; i < a.elements; ++i)
3478         cout << ' ' << a[i];
3479     return cout;
3480 }
3481 
3482 
3483 OIIO_FORCEINLINE void vbool8::load (bool a) {
3484 #if OIIO_SIMD_AVX
3485     m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a)));
3486 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3487     m_4[0].load(a);
3488     m_4[1].load(a);
3489 #else
3490     int val = -int(a);
3491     SIMD_CONSTRUCT (val);
3492 #endif
3493 }
3494 
3495 
3496 OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d,
3497                                    bool e, bool f, bool g, bool h) {
3498 #if OIIO_SIMD_AVX
3499     // N.B. -- we need to reverse the order because of our convention
3500     // of storing a,b,c,d in the same order in memory.
3501     m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e),
3502                                                   -int(d), -int(c), -int(b), -int(a)));
3503 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3504     m_4[0].load(a, b, c, d);
3505     m_4[1].load(e, f, g, h);
3506 #else
3507     m_val[0] = -int(a);
3508     m_val[1] = -int(b);
3509     m_val[2] = -int(c);
3510     m_val[3] = -int(d);
3511     m_val[4] = -int(e);
3512     m_val[5] = -int(f);
3513     m_val[6] = -int(g);
3514     m_val[7] = -int(h);
3515 #endif
3516 }
3517 
3518 OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d,
3519                                bool e, bool f, bool g, bool h) {
3520     load (a, b, c, d, e, f, g, h);
3521 }
3522 
3523 OIIO_FORCEINLINE vbool8::vbool8 (int a, int b, int c, int d,
3524                                  int e, int f, int g, int h) {
3525     load (bool(a), bool(b), bool(c), bool(d),
3526           bool(e), bool(f), bool(g), bool(h));
3527 }
3528 
3529 OIIO_FORCEINLINE vbool8::vbool8 (const bool *a) {
3530     load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3531 }
3532 
3533 
3534 OIIO_FORCEINLINE const vbool8& vbool8::operator= (bool a) {
3535     load(a);
3536     return *this;
3537 }
3538 
3539 OIIO_FORCEINLINE const vbool8& vbool8::operator= (const vbool8 & other) {
3540     m_simd = other.m_simd;
3541     return *this;
3542 }
3543 
3544 OIIO_FORCEINLINE int vbool8::bitmask () const {
3545 #if OIIO_SIMD_AVX
3546     return _mm256_movemask_ps(m_simd);
3547 #else
3548     return lo().bitmask() | (hi().bitmask() << 4);
3549 #endif
3550 }
3551 
3552 
3553 OIIO_FORCEINLINE vbool8
3554 vbool8::from_bitmask (int bitmask) {
3555     // I think this is a fast conversion from int bitmask to vbool8
3556     return (vint8::Giota() & vint8(bitmask)) != vint8::Zero();
3557 }
3558 
3559 
3560 OIIO_FORCEINLINE void vbool8::clear () {
3561 #if OIIO_SIMD_AVX
3562     m_simd = _mm256_setzero_ps();
3563 #else
3564     *this = false;
3565 #endif
3566 }
3567 
3568 OIIO_FORCEINLINE const vbool8 vbool8::False () {
3569 #if OIIO_SIMD_AVX
3570     return _mm256_setzero_ps();
3571 #else
3572     return false;
3573 #endif
3574 }
3575 
3576 
3577 OIIO_FORCEINLINE const vbool8 vbool8::True () {
3578 #if OIIO_SIMD_AVX
3579 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3580     // Fastest way to fill with all 1 bits is to cmp any value to itself.
3581     __m256i anyval = _mm256_undefined_si256();
3582     return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3583 # else
3584     return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3585 # endif
3586 #else
3587     return true;
3588 #endif
3589 }
3590 
3591 
3592 OIIO_FORCEINLINE void vbool8::store (bool *values) const {
3593     SIMD_DO (values[i] = m_val[i] ? true : false);
3594 }
3595 
3596 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const {
3597     OIIO_DASSERT (n >= 0 && n <= elements);
3598     for (int i = 0; i < n; ++i)
3599         values[i] = m_val[i] ? true : false;
3600 }
3601 
3602 
3603 OIIO_FORCEINLINE vbool4 vbool8::lo () const {
3604 #if OIIO_SIMD_AVX
3605     return _mm256_castps256_ps128 (simd());
3606 #else
3607     return m_4[0];
3608 #endif
3609 }
3610 
3611 OIIO_FORCEINLINE vbool4 vbool8::hi () const {
3612 #if OIIO_SIMD_AVX
3613     return _mm256_extractf128_ps (simd(), 1);
3614 #else
3615     return m_4[1];
3616 #endif
3617 }
3618 
3619 
3620 OIIO_FORCEINLINE vbool8::vbool8 (const vbool4& lo, const vbool4 &hi) {
3621 #if OIIO_SIMD_AVX
3622     __m256 r = _mm256_castps128_ps256 (lo);
3623     m_simd = _mm256_insertf128_ps (r, hi, 1);
3624     // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
3625 #else
3626     m_4[0] = lo;
3627     m_4[1] = hi;
3628 #endif
3629 }
3630 
3631 
3632 OIIO_FORCEINLINE vbool8 operator! (const vbool8 & a) {
3633 #if OIIO_SIMD_AVX
3634     return _mm256_xor_ps (a.simd(), vbool8::True());
3635 #else
3636     SIMD_RETURN (vbool8, a[i] ^ (-1));
3637 #endif
3638 }
3639 
3640 OIIO_FORCEINLINE vbool8 operator& (const vbool8 & a, const vbool8 & b) {
3641 #if OIIO_SIMD_AVX
3642     return _mm256_and_ps (a.simd(), b.simd());
3643 #else
3644     SIMD_RETURN (vbool8, a[i] & b[i]);
3645 #endif
3646 }
3647 
3648 OIIO_FORCEINLINE vbool8 operator| (const vbool8 & a, const vbool8 & b) {
3649 #if OIIO_SIMD_AVX
3650     return _mm256_or_ps (a.simd(), b.simd());
3651 #else
3652     SIMD_RETURN (vbool8, a[i] | b[i]);
3653 #endif
3654 }
3655 
3656 OIIO_FORCEINLINE vbool8 operator^ (const vbool8& a, const vbool8& b) {
3657 #if OIIO_SIMD_AVX
3658     return _mm256_xor_ps (a.simd(), b.simd());
3659 #else
3660     SIMD_RETURN (vbool8, a[i] ^ b[i]);
3661 #endif
3662 }
3663 
3664 
3665 OIIO_FORCEINLINE const vbool8& operator&= (vbool8& a, const vbool8 &b) {
3666     return a = a & b;
3667 }
3668 
3669 OIIO_FORCEINLINE const vbool8& operator|= (vbool8& a, const vbool8& b) {
3670     return a = a | b;
3671 }
3672 
3673 OIIO_FORCEINLINE const vbool8& operator^= (vbool8& a, const vbool8& b) {
3674     return a = a ^ b;
3675 }
3676 
3677 
3678 OIIO_FORCEINLINE vbool8 operator~ (const vbool8& a) {
3679 #if OIIO_SIMD_AVX
3680     // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3681     return _mm256_xor_ps (a.simd(), vbool8::True());
3682 #else
3683     SIMD_RETURN (vbool8, ~a[i]);
3684 #endif
3685 }
3686 
3687 
3688 OIIO_FORCEINLINE vbool8 operator== (const vbool8 & a, const vbool8 & b) {
3689 #if OIIO_SIMD_AVX >= 2
3690     return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3691 #elif OIIO_SIMD_AVX
3692     return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3693 #else
3694     SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
3695 #endif
3696 }
3697 
3698 OIIO_FORCEINLINE vbool8 operator!= (const vbool8 & a, const vbool8 & b) {
3699 #if OIIO_SIMD_AVX
3700     return _mm256_xor_ps (a, b);
3701 #else
3702     SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0);
3703 #endif
3704 }
3705 
3706 
3707 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
3708 OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3709 #if OIIO_SIMD_AVX >= 2
3710     vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3711     return _mm256_permutevar8x32_ps (a.simd(), index.simd());
3712 #else
3713     return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3714 #endif
3715 }
3716 
3717 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3718     return shuffle<i,i,i,i,i,i,i,i>(a);
3719 }
3720 
3721 
3722 template<int i>
3723 OIIO_FORCEINLINE bool extract (const vbool8& a) {
3724 #if OIIO_SIMD_AVX && !_WIN32
3725     return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i);  // SSE4.1 only
3726 #else
3727     return a[i];
3728 #endif
3729 }
3730 
3731 template<int i>
3732 OIIO_FORCEINLINE vbool8 insert (const vbool8& a, bool val) {
3733 #if OIIO_SIMD_AVX && !_WIN32
3734     int ival = -int(val);
3735     return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i));
3736 #else
3737     vbool8 tmp = a;
3738     tmp[i] = -int(val);
3739     return tmp;
3740 #endif
3741 }
3742 
3743 
3744 OIIO_FORCEINLINE bool reduce_and (const vbool8& v) {
3745 #if OIIO_SIMD_AVX
3746     return _mm256_testc_ps (v, vbool8(true)) != 0;
3747     // return _mm256_movemask_ps(v.simd()) == 0xff;
3748 #else
3749     SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i]));
3750 #endif
3751 }
3752 
3753 OIIO_FORCEINLINE bool reduce_or (const vbool8& v) {
3754 #if OIIO_SIMD_AVX
3755     return ! _mm256_testz_ps (v, v);   // FIXME? Not in all immintrin.h !
3756     // return _mm256_movemask_ps(v) != 0;
3757 #else
3758     SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i]));
3759 #endif
3760 }
3761 
3762 
3763 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; }
3764 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; }
3765 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; }
3766 
3767 
3768 
3769 //////////////////////////////////////////////////////////////////////
3770 // vbool16 implementation
3771 
3772 
3773 OIIO_FORCEINLINE int vbool16::operator[] (int i) const {
3774     OIIO_DASSERT(i >= 0 && i < elements);
3775 #if OIIO_SIMD_AVX >= 512
3776     return (int(m_simd) >> i) & 1;
3777 #else
3778     return (m_bits >> i) & 1;
3779 #endif
3780 }
3781 
3782 OIIO_FORCEINLINE void vbool16::setcomp (int i, bool value) {
3783     OIIO_DASSERT(i >= 0 && i < elements);
3784     int bits = m_bits;
3785     bits &= (0xffff ^ (1<<i));
3786     bits |= (int(value)<<i);
3787     m_bits = bits;
3788 }
3789 
3790 
3791 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) {
3792     cout << a[0];
3793     for (int i = 1; i < a.elements; ++i)
3794         cout << ' ' << a[i];
3795     return cout;
3796 }
3797 
3798 
3799 OIIO_FORCEINLINE void vbool16::load (bool a) {
3800     m_simd = a ? 0xffff : 0;
3801 }
3802 
3803 
3804 OIIO_FORCEINLINE void vbool16::load_bitmask (int a) {
3805     m_simd = simd_t(a);
3806 }
3807 
3808 
3809 OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3,
3810                                     bool v4, bool v5, bool v6, bool v7,
3811                                     bool v8, bool v9, bool v10, bool v11,
3812                                     bool v12, bool v13, bool v14, bool v15) {
3813     m_simd = simd_t((int(v0) << 0) |
3814                     (int(v1) << 1) |
3815                     (int(v2) << 2) |
3816                     (int(v3) << 3) |
3817                     (int(v4) << 4) |
3818                     (int(v5) << 5) |
3819                     (int(v6) << 6) |
3820                     (int(v7) << 7) |
3821                     (int(v8) << 8) |
3822                     (int(v9) << 9) |
3823                     (int(v10) << 10) |
3824                     (int(v11) << 11) |
3825                     (int(v12) << 12) |
3826                     (int(v13) << 13) |
3827                     (int(v14) << 14) |
3828                     (int(v15) << 15));
3829 }
3830 
3831 OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3,
3832                                  bool v4, bool v5, bool v6, bool v7,
3833                                  bool v8, bool v9, bool v10, bool v11,
3834                                  bool v12, bool v13, bool v14, bool v15) {
3835     load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3836 }
3837 
3838 OIIO_FORCEINLINE vbool16::vbool16 (int v0, int v1, int v2, int v3,
3839                                    int v4, int v5, int v6, int v7,
3840                                    int v8, int v9, int v10, int v11,
3841                                    int v12, int v13, int v14, int v15) {
3842     load (bool(v0), bool(v1), bool(v2), bool(v3),
3843           bool(v4), bool(v5), bool(v6), bool(v7),
3844           bool(v8), bool(v9), bool(v10), bool(v11),
3845           bool(v12), bool(v13), bool(v14), bool(v15));
3846 }
3847 
3848 OIIO_FORCEINLINE vbool16::vbool16 (const vbool8& a, const vbool8& b) {
3849     load_bitmask (a.bitmask() | (b.bitmask() << 8));
3850 }
3851 
3852 OIIO_FORCEINLINE vbool16::vbool16 (const bool *a) {
3853     load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3854           a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3855 }
3856 
3857 
3858 OIIO_FORCEINLINE const vbool16& vbool16::operator= (bool a) {
3859     load(a);
3860     return *this;
3861 }
3862 
3863 OIIO_FORCEINLINE const vbool16& vbool16::operator= (const vbool16 & other) {
3864     m_simd = other.m_simd;
3865     return *this;
3866 }
3867 
3868 
3869 OIIO_FORCEINLINE int vbool16::bitmask () const {
3870 #if OIIO_SIMD_AVX >= 512
3871     return int(m_simd);
3872 #else
3873     return int(m_bits);
3874 #endif
3875 }
3876 
3877 
3878 OIIO_FORCEINLINE void vbool16::clear () {
3879     m_simd = simd_t(0);
3880 }
3881 
3882 OIIO_FORCEINLINE const vbool16 vbool16::False () {
3883     return simd_t(0);
3884 }
3885 
3886 
3887 OIIO_FORCEINLINE const vbool16 vbool16::True () {
3888     return simd_t(0xffff);
3889 }
3890 
3891 
3892 OIIO_FORCEINLINE void vbool16::store (bool *values) const {
3893     SIMD_DO (values[i] = m_bits & (1<<i));
3894 }
3895 
3896 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const {
3897     OIIO_DASSERT (n >= 0 && n <= elements);
3898     for (int i = 0; i < n; ++i)
3899         values[i] = m_bits & (1<<i);
3900 }
3901 
3902 
3903 
3904 OIIO_FORCEINLINE vbool8 vbool16::lo () const {
3905 #if OIIO_SIMD_AVX >= 512
3906     return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1));
3907 #else
3908     SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0);
3909 #endif
3910 }
3911 
3912 OIIO_FORCEINLINE vbool8 vbool16::hi () const {
3913 #if OIIO_SIMD_AVX >= 512
3914     return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1));
3915 #else
3916     SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0);
3917 #endif
3918 }
3919 
3920 
3921 OIIO_FORCEINLINE vbool16 operator! (const vbool16 & a) {
3922 #if OIIO_SIMD_AVX >= 512
3923     return _mm512_knot (a.simd());
3924 #else
3925     return vbool16 (a.m_bits ^ 0xffff);
3926 #endif
3927 }
3928 
3929 OIIO_FORCEINLINE vbool16 operator& (const vbool16 & a, const vbool16 & b) {
3930 #if OIIO_SIMD_AVX >= 512
3931     return _mm512_kand (a.simd(), b.simd());
3932 #else
3933     return vbool16 (a.m_bits & b.m_bits);
3934 #endif
3935 }
3936 
3937 OIIO_FORCEINLINE vbool16 operator| (const vbool16 & a, const vbool16 & b) {
3938 #if OIIO_SIMD_AVX >= 512
3939     return _mm512_kor (a.simd(), b.simd());
3940 #else
3941     return vbool16 (a.m_bits | b.m_bits);
3942 #endif
3943 }
3944 
3945 OIIO_FORCEINLINE vbool16 operator^ (const vbool16& a, const vbool16& b) {
3946 #if OIIO_SIMD_AVX >= 512
3947     return _mm512_kxor (a.simd(), b.simd());
3948 #else
3949     return vbool16 (a.m_bits ^ b.m_bits);
3950 #endif
3951 }
3952 
3953 
3954 OIIO_FORCEINLINE const vbool16& operator&= (vbool16& a, const vbool16 &b) {
3955     return a = a & b;
3956 }
3957 
3958 OIIO_FORCEINLINE const vbool16& operator|= (vbool16& a, const vbool16& b) {
3959     return a = a | b;
3960 }
3961 
3962 OIIO_FORCEINLINE const vbool16& operator^= (vbool16& a, const vbool16& b) {
3963     return a = a ^ b;
3964 }
3965 
3966 
3967 OIIO_FORCEINLINE vbool16 operator~ (const vbool16& a) {
3968     return a ^ vbool16::True();
3969 }
3970 
3971 
3972 OIIO_FORCEINLINE vbool16 operator== (const vbool16 & a, const vbool16 & b) {
3973 #if OIIO_SIMD_AVX >= 512
3974     return _mm512_kxnor (a.simd(), b.simd());
3975 #else
3976     return vbool16 (!(a.m_bits ^ b.m_bits));
3977 #endif
3978 }
3979 
3980 OIIO_FORCEINLINE vbool16 operator!= (const vbool16 & a, const vbool16 & b) {
3981 #if OIIO_SIMD_AVX >= 512
3982     return _mm512_kxor (a.simd(), b.simd());
3983 #else
3984     return vbool16 (a.m_bits ^ b.m_bits);
3985 #endif
3986 }
3987 
3988 
3989 template<int i>
3990 OIIO_FORCEINLINE bool extract (const vbool16& a) {
3991     return a[i];
3992 }
3993 
3994 template<int i>
3995 OIIO_FORCEINLINE vbool16 insert (const vbool16& a, bool val) {
3996     vbool16 tmp = a;
3997     tmp.setcomp (i, val);
3998     return tmp;
3999 }
4000 
4001 
4002 OIIO_FORCEINLINE bool reduce_and (const vbool16& v) {
4003     return v.bitmask() == 0xffff;
4004 }
4005 
4006 OIIO_FORCEINLINE bool reduce_or (const vbool16& v) {
4007     return v.bitmask() != 0;
4008 }
4009 
4010 
4011 OIIO_FORCEINLINE bool all (const vbool16& v) { return reduce_and(v) == true; }
4012 OIIO_FORCEINLINE bool any (const vbool16& v) { return reduce_or(v) == true; }
4013 OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; }
4014 
4015 
4016 
4017 
4018 
4019 
4020 //////////////////////////////////////////////////////////////////////
4021 // vint4 implementation
4022 
4023 OIIO_FORCEINLINE const vint4 & vint4::operator= (const vint4& other) {
4024     m_simd = other.m_simd;
4025     return *this;
4026 }
4027 
4028 OIIO_FORCEINLINE int vint4::operator[] (int i) const {
4029     OIIO_DASSERT(i<elements);
4030     return m_val[i];
4031 }
4032 
4033 OIIO_FORCEINLINE int& vint4::operator[] (int i) {
4034     OIIO_DASSERT(i<elements);
4035     return m_val[i];
4036 }
4037 
4038 OIIO_FORCEINLINE void vint4::setcomp (int i, int val) {
4039     OIIO_DASSERT(i<elements);
4040     m_val[i] = val;
4041 }
4042 
4043 
4044 OIIO_FORCEINLINE void vint4::load (int a) {
4045 #if OIIO_SIMD_SSE
4046     m_simd = _mm_set1_epi32 (a);
4047 #elif OIIO_SIMD_NEON
4048     m_simd = vdupq_n_s32 (a);
4049 #else
4050     SIMD_CONSTRUCT (a);
4051 #endif
4052 }
4053 
4054 
4055 
4056 OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d) {
4057 #if OIIO_SIMD_SSE
4058     m_simd = _mm_set_epi32 (d, c, b, a);
4059 #elif OIIO_SIMD_NEON
4060     int values[4] = { a, b, c, d };
4061     m_simd = vld1q_s32 (values);
4062 #else
4063     m_val[0] = a;
4064     m_val[1] = b;
4065     m_val[2] = c;
4066     m_val[3] = d;
4067 #endif
4068 }
4069 
4070 
4071 // OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d,
4072 //                                   int e, int f, int g, int h) {
4073 //     load (a, b, c, d);
4074 // }
4075 
4076 
4077 
4078 OIIO_FORCEINLINE void vint4::load (const int *values) {
4079 #if OIIO_SIMD_SSE
4080     m_simd = _mm_loadu_si128 ((const simd_t *)values);
4081 #else
4082     SIMD_CONSTRUCT (values[i]);
4083 #endif
4084 }
4085 
4086 
4087 OIIO_FORCEINLINE void vint4::load (const int *values, int n)
4088 {
4089     OIIO_DASSERT (n >= 0 && n <= elements);
4090 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4091     m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values);
4092 #elif OIIO_SIMD_SSE
4093     switch (n) {
4094     case 1:
4095         m_simd = _mm_castps_si128 (_mm_load_ss ((const float *)values));
4096         break;
4097     case 2:
4098         // Trickery: load one double worth of bits!
4099         m_simd = _mm_castpd_si128 (_mm_load_sd ((const double*)values));
4100         break;
4101     case 3:
4102         // Trickery: load one double worth of bits, then a float,
4103         // and combine, casting to ints.
4104         m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((const double*)values)),
4105                                                 _mm_load_ss ((const float *)values + 2)));
4106         break;
4107     case 4:
4108         m_simd = _mm_loadu_si128 ((const simd_t *)values);
4109         break;
4110     default:
4111         clear ();
4112         break;
4113     }
4114 #else
4115     for (int i = 0; i < n; ++i)
4116         m_val[i] = values[i];
4117     for (int i = n; i < elements; ++i)
4118         m_val[i] = 0;
4119 #endif
4120 }
4121 
4122 
4123 OIIO_FORCEINLINE void vint4::load (const unsigned short *values) {
4124 #if OIIO_SIMD_SSE >= 4
4125     // Trickery: load one double worth of bits = 4 ushorts!
4126     simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
4127     m_simd = _mm_cvtepu16_epi32 (a);
4128 #else
4129     SIMD_CONSTRUCT (values[i]);
4130 #endif
4131 }
4132 
4133 
4134 OIIO_FORCEINLINE void vint4::load (const short *values) {
4135 #if OIIO_SIMD_SSE >= 4
4136     // Trickery: load one double worth of bits = 4 shorts!
4137     simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
4138     m_simd = _mm_cvtepi16_epi32 (a);
4139 #else
4140     SIMD_CONSTRUCT (values[i]);
4141 #endif
4142 }
4143 
4144 
4145 OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {
4146 #if OIIO_SIMD_SSE >= 4
4147     // Trickery: load one float worth of bits = 4 uchars!
4148     simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
4149     m_simd = _mm_cvtepu8_epi32 (a);
4150 #else
4151     SIMD_CONSTRUCT (values[i]);
4152 #endif
4153 }
4154 
4155 
4156 OIIO_FORCEINLINE void vint4::load (const char *values) {
4157 #if OIIO_SIMD_SSE >= 4
4158     // Trickery: load one float worth of bits = 4 chars!
4159     simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
4160     m_simd = _mm_cvtepi8_epi32 (a);
4161 #else
4162     SIMD_CONSTRUCT (values[i]);
4163 #endif
4164 }
4165 
4166 
4167 OIIO_FORCEINLINE vint4::vint4 (int a) { load(a); }
4168 
4169 OIIO_FORCEINLINE vint4::vint4 (int a, int b) { load(a,a,b,b); }
4170 
4171 OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d) { load(a,b,c,d); }
4172 
4173 // OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d,
4174 //                              int e, int f, int g, int h) {
4175 //     load(a,b,c,d,e,f,g,h);
4176 // }
4177 
4178 OIIO_FORCEINLINE vint4::vint4 (const int *vals) { load (vals); }
4179 OIIO_FORCEINLINE vint4::vint4 (const unsigned short *vals) { load(vals); }
4180 OIIO_FORCEINLINE vint4::vint4 (const short *vals) { load(vals); }
4181 OIIO_FORCEINLINE vint4::vint4 (const unsigned char *vals) { load(vals); }
4182 OIIO_FORCEINLINE vint4::vint4 (const char *vals) { load(vals); }
4183 
4184 OIIO_FORCEINLINE const vint4 & vint4::operator= (int a) { load(a); return *this; }
4185 
4186 
4187 OIIO_FORCEINLINE void vint4::store (int *values) const {
4188 #if OIIO_SIMD_SSE
4189     // Use an unaligned store -- it's just as fast when the memory turns
4190     // out to be aligned, nearly as fast even when unaligned. Not worth
4191     // the headache of using stores that require alignment.
4192     _mm_storeu_si128 ((simd_t *)values, m_simd);
4193 #else
4194     SIMD_DO (values[i] = m_val[i]);
4195 #endif
4196 }
4197 
4198 
4199 OIIO_FORCEINLINE void vint4::load_mask (int mask, const value_t *values) {
4200 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4201     m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values);
4202 #elif OIIO_SIMD_AVX >= 2
4203     m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
4204 #else
4205     SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0);
4206 #endif
4207 }
4208 
4209 
4210 OIIO_FORCEINLINE void vint4::load_mask (const vbool_t& mask, const value_t *values) {
4211 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4212     m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values);
4213 #elif OIIO_SIMD_AVX >= 2
4214     m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(mask));
4215 #else
4216     SIMD_CONSTRUCT (mask[i] ? values[i] : 0);
4217 #endif
4218 }
4219 
4220 
4221 OIIO_FORCEINLINE void vint4::store_mask (int mask, value_t *values) const {
4222 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4223     _mm_mask_storeu_epi32 (values, __mmask8(mask), m_simd);
4224 #elif OIIO_SIMD_AVX >= 2
4225     _mm_maskstore_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
4226 #else
4227     SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
4228 #endif
4229 }
4230 
4231 
4232 OIIO_FORCEINLINE void vint4::store_mask (const vbool_t& mask, value_t *values) const {
4233 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4234     _mm_mask_storeu_epi32 (values, mask.bitmask(), m_simd);
4235 #elif OIIO_SIMD_AVX >= 2
4236     _mm_maskstore_epi32 (values, _mm_castps_si128(mask), m_simd);
4237 #else
4238     SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
4239 #endif
4240 }
4241 
4242 
4243 template <int scale>
4244 OIIO_FORCEINLINE void
4245 vint4::gather (const value_t *baseptr, const vint_t& vindex)
4246 {
4247 #if OIIO_SIMD_AVX >= 2
4248     m_simd = _mm_i32gather_epi32 (baseptr, vindex, scale);
4249 #else
4250     SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
4251 #endif
4252 }
4253 
4254 template<int scale>
4255 OIIO_FORCEINLINE void
4256 vint4::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
4257 {
4258 #if OIIO_SIMD_AVX >= 2
4259     m_simd = _mm_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm_cvtps_epi32(mask), scale);
4260 #else
4261     SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
4262 #endif
4263 }
4264 
4265 template<int scale>
4266 OIIO_FORCEINLINE void
4267 vint4::scatter (value_t *baseptr, const vint_t& vindex) const
4268 {
4269 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4270     // FIXME: disable because it benchmarks slower than the dumb way
4271     _mm_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
4272 #else
4273     SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
4274 #endif
4275 }
4276 
4277 template<int scale>
4278 OIIO_FORCEINLINE void
4279 vint4::scatter_mask (const bool_t& mask, value_t *baseptr,
4280                      const vint_t& vindex) const
4281 {
4282 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4283     // FIXME: disable because it benchmarks slower than the dumb way
4284     _mm_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale);
4285 #else
4286     SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
4287 #endif
4288 }
4289 
4290 
4291 OIIO_FORCEINLINE void vint4::clear () {
4292 #if OIIO_SIMD_SSE
4293     m_simd = _mm_setzero_si128();
4294 #else
4295     *this = 0;
4296 #endif
4297 }
4298 
4299 
4300 
4301 OIIO_FORCEINLINE const vint4 vint4::Zero () {
4302 #if OIIO_SIMD_SSE
4303     return _mm_setzero_si128();
4304 #else
4305     return 0;
4306 #endif
4307 }
4308 
4309 
4310 OIIO_FORCEINLINE const vint4 vint4::One () { return vint4(1); }
4311 
4312 OIIO_FORCEINLINE const vint4 vint4::NegOne () {
4313 #if OIIO_SIMD_SSE
4314     // Fastest way to fill an __m128 with all 1 bits is to cmpeq_epi8
4315     // any value to itself.
4316 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
4317     __m128i anyval = _mm_undefined_si128();
4318 # else
4319     __m128i anyval = _mm_setzero_si128();
4320 # endif
4321     return _mm_cmpeq_epi8 (anyval, anyval);
4322 #else
4323     return vint4(-1);
4324 #endif
4325 }
4326 
4327 
4328 
4329 OIIO_FORCEINLINE const vint4 vint4::Iota (int start, int step) {
4330     return vint4 (start+0*step, start+1*step, start+2*step, start+3*step);
4331 }
4332 
4333 
4334 OIIO_FORCEINLINE const vint4 vint4::Giota () {
4335     return vint4 (1<<0, 1<<1, 1<<2, 1<<3);
4336 }
4337 
4338 
4339 OIIO_FORCEINLINE vint4 operator+ (const vint4& a, const vint4& b) {
4340 #if OIIO_SIMD_SSE
4341     return _mm_add_epi32 (a.simd(), b.simd());
4342 #else
4343     SIMD_RETURN (vint4, a[i] + b[i]);
4344 #endif
4345 }
4346 
4347 OIIO_FORCEINLINE const vint4& operator+= (vint4& a, const vint4& b) {
4348     return a = a + b;
4349 }
4350 
4351 
4352 OIIO_FORCEINLINE vint4 operator- (const vint4& a) {
4353 #if OIIO_SIMD_SSE
4354     return _mm_sub_epi32 (_mm_setzero_si128(), a);
4355 #else
4356     SIMD_RETURN (vint4, -a[i]);
4357 #endif
4358 }
4359 
4360 
4361 OIIO_FORCEINLINE vint4 operator- (const vint4& a, const vint4& b) {
4362 #if OIIO_SIMD_SSE
4363     return _mm_sub_epi32 (a.simd(), b.simd());
4364 #else
4365     SIMD_RETURN (vint4, a[i] - b[i]);
4366 #endif
4367 }
4368 
4369 
4370 OIIO_FORCEINLINE const vint4 &operator-= (vint4& a, const vint4& b) {
4371     return a = a - b;
4372 }
4373 
4374 
4375 #if OIIO_SIMD_SSE
4376 // Shamelessly lifted from Syrah which lifted from Manta which lifted it
4377 // from intel.com
4378 OIIO_FORCEINLINE __m128i mul_epi32 (__m128i a, __m128i b) {
4379 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
4380     return _mm_mullo_epi32(a, b);
4381 #else
4382     // Prior to SSE 4.1, there is no _mm_mullo_epi32 instruction, so we have
4383     // to fake it.
4384     __m128i t0;
4385     __m128i t1;
4386     t0 = _mm_mul_epu32 (a, b);
4387     t1 = _mm_mul_epu32 (_mm_shuffle_epi32 (a, 0xB1),
4388                         _mm_shuffle_epi32 (b, 0xB1));
4389     t0 = _mm_shuffle_epi32 (t0, 0xD8);
4390     t1 = _mm_shuffle_epi32 (t1, 0xD8);
4391     return _mm_unpacklo_epi32 (t0, t1);
4392 #endif
4393 }
4394 #endif
4395 
4396 
4397 OIIO_FORCEINLINE vint4 operator* (const vint4& a, const vint4& b) {
4398 #if OIIO_SIMD_SSE
4399     return mul_epi32 (a.simd(), b.simd());
4400 #else
4401     SIMD_RETURN (vint4, a[i] * b[i]);
4402 #endif
4403 }
4404 
4405 
4406 OIIO_FORCEINLINE const vint4& operator*= (vint4& a, const vint4& b) { return a = a * b; }
4407 OIIO_FORCEINLINE const vint4& operator*= (vint4& a, int b) { return a = a * b; }
4408 
4409 
4410 OIIO_FORCEINLINE vint4 operator/ (const vint4& a, const vint4& b) {
4411     // NO INTEGER DIVISION IN SSE!
4412     SIMD_RETURN (vint4, a[i] / b[i]);
4413 }
4414 
4415 
4416 OIIO_FORCEINLINE const vint4& operator/= (vint4& a, const vint4& b) { return a = a / b; }
4417 
4418 OIIO_FORCEINLINE vint4 operator% (const vint4& a, const vint4& b) {
4419     // NO INTEGER MODULUS IN SSE!
4420     SIMD_RETURN (vint4, a[i] % b[i]);
4421 }
4422 
4423 
4424 
4425 OIIO_FORCEINLINE const vint4& operator%= (vint4& a, const vint4& b) { return a = a % b; }
4426 
4427 
4428 OIIO_FORCEINLINE vint4 operator% (const vint4& a, int w) {
4429     // NO INTEGER MODULUS in SSE!
4430     SIMD_RETURN (vint4, a[i] % w);
4431 }
4432 
4433 
4434 OIIO_FORCEINLINE const vint4& operator%= (vint4& a, int b) { return a = a % b; }
4435 
4436 
4437 OIIO_FORCEINLINE vint4 operator& (const vint4& a, const vint4& b) {
4438 #if OIIO_SIMD_SSE
4439     return _mm_and_si128 (a.simd(), b.simd());
4440 #elif OIIO_SIMD_NEON
4441     return vandq_s32(a.simd(), b.simd());
4442 #else
4443     SIMD_RETURN (vint4, a[i] & b[i]);
4444 #endif
4445 }
4446 
4447 
4448 OIIO_FORCEINLINE const vint4& operator&= (vint4& a, const vint4& b) { return a = a & b; }
4449 
4450 
4451 
4452 OIIO_FORCEINLINE vint4 operator| (const vint4& a, const vint4& b) {
4453 #if OIIO_SIMD_SSE
4454     return _mm_or_si128 (a.simd(), b.simd());
4455 #elif OIIO_SIMD_NEON
4456     return vorrq_s32(a.simd(), b.simd());
4457 #else
4458     SIMD_RETURN (vint4, a[i] | b[i]);
4459 #endif
4460 }
4461 
4462 OIIO_FORCEINLINE const vint4& operator|= (vint4& a, const vint4& b) { return a = a | b; }
4463 
4464 
4465 OIIO_FORCEINLINE vint4 operator^ (const vint4& a, const vint4& b) {
4466 #if OIIO_SIMD_SSE
4467     return _mm_xor_si128 (a.simd(), b.simd());
4468 #elif OIIO_SIMD_NEON
4469     return veorq_s32(a.simd(), b.simd());
4470 #else
4471     SIMD_RETURN (vint4, a[i] ^ b[i]);
4472 #endif
4473 }
4474 
4475 
4476 OIIO_FORCEINLINE const vint4& operator^= (vint4& a, const vint4& b) { return a = a ^ b; }
4477 
4478 
4479 OIIO_FORCEINLINE vint4 operator~ (const vint4& a) {
4480 #if OIIO_SIMD_SSE
4481     return a ^ a.NegOne();
4482 #elif OIIO_SIMD_NEON
4483     return vmvnq_s32(a.m_simd);
4484 #else
4485     SIMD_RETURN (vint4, ~a[i]);
4486 #endif
4487 }
4488 
4489 OIIO_FORCEINLINE vint4 operator<< (const vint4& a, unsigned int bits) {
4490 #if OIIO_SIMD_SSE
4491     return _mm_slli_epi32 (a, bits);
4492 #else
4493     SIMD_RETURN (vint4, a[i] << bits);
4494 #endif
4495 }
4496 
4497 OIIO_FORCEINLINE const vint4& operator<<= (vint4& a, const unsigned int bits) {
4498     return a = a << bits;
4499 }
4500 
4501 
4502 OIIO_FORCEINLINE vint4 operator>> (const vint4& a, const unsigned int bits) {
4503 #if OIIO_SIMD_SSE
4504     return _mm_srai_epi32 (a, bits);
4505 #else
4506     SIMD_RETURN (vint4, a[i] >> bits);
4507 #endif
4508 }
4509 
4510 OIIO_FORCEINLINE const vint4& operator>>= (vint4& a, const unsigned int bits) {
4511     return a = a >> bits;
4512 }
4513 
4514 
4515 OIIO_FORCEINLINE vint4 srl (const vint4& a, const unsigned int bits) {
4516 #if OIIO_SIMD_SSE
4517     return _mm_srli_epi32 (a, bits);
4518 #else
4519     SIMD_RETURN (vint4, int ((unsigned int)(a[i]) >> bits));
4520 #endif
4521 }
4522 
4523 
4524 OIIO_FORCEINLINE vbool4 operator== (const vint4& a, const vint4& b) {
4525 #if OIIO_SIMD_SSE
4526     return _mm_castsi128_ps(_mm_cmpeq_epi32 (a, b));
4527 #elif OIIO_SIMD_NEON
4528     return vceqq_s32 (a.m_simd, b.m_simd);
4529 #else
4530     SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
4531 #endif
4532 }
4533 
4534 OIIO_FORCEINLINE vbool4 operator!= (const vint4& a, const vint4& b) {
4535     return ! (a == b);
4536 }
4537 
4538 
4539 OIIO_FORCEINLINE vbool4 operator> (const vint4& a, const vint4& b) {
4540 #if OIIO_SIMD_SSE
4541     return _mm_castsi128_ps(_mm_cmpgt_epi32 (a, b));
4542 #else
4543     SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
4544 #endif
4545 }
4546 
4547 OIIO_FORCEINLINE vbool4 operator< (const vint4& a, const vint4& b) {
4548 #if OIIO_SIMD_SSE
4549     return _mm_castsi128_ps(_mm_cmplt_epi32 (a, b));
4550 #else
4551     SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
4552 #endif
4553 }
4554 
4555 OIIO_FORCEINLINE vbool4 operator>= (const vint4& a, const vint4& b) {
4556     return (b < a) | (a == b);
4557 }
4558 
4559 OIIO_FORCEINLINE vbool4 operator<= (const vint4& a, const vint4& b) {
4560     return (b > a) | (a == b);
4561 }
4562 
4563 inline std::ostream& operator<< (std::ostream& cout, const vint4& val) {
4564     cout << val[0];
4565     for (int i = 1; i < val.elements; ++i)
4566         cout << ' ' << val[i];
4567     return cout;
4568 }
4569 
4570 
4571 OIIO_FORCEINLINE void vint4::store (int *values, int n) const {
4572     OIIO_DASSERT (n >= 0 && n <= elements);
4573 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4574     // This SHOULD be fast, but in my benchmarks, it is slower!
4575     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
4576     // Re-test this periodically with new Intel hardware.
4577     _mm_mask_storeu_epi32 (values, __mmask8(~(0xf << n)), m_simd);
4578 #elif OIIO_SIMD
4579     // For full SIMD, there is a speed advantage to storing all components.
4580     if (n == elements)
4581         store (values);
4582     else
4583         for (int i = 0; i < n; ++i)
4584             values[i] = m_val[i];
4585 #else
4586     for (int i = 0; i < n; ++i)
4587         values[i] = m_val[i];
4588 #endif
4589 }
4590 
4591 
4592 
4593 OIIO_FORCEINLINE void vint4::store (unsigned short *values) const {
4594 #if OIIO_AVX512VL_ENABLED
4595     _mm_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xf), m_simd);
4596 #elif OIIO_SIMD_SSE
4597     // Expressed as half-words and considering little endianness, we
4598     // currently have AxBxCxDx (the 'x' means don't care).
4599     vint4 clamped = m_simd & vint4(0xffff);   // A0B0C0D0
4600     vint4 low = _mm_shufflelo_epi16 (clamped, (0<<0) | (2<<2) | (1<<4) | (1<<6));
4601                     // low = AB00xxxx
4602     vint4 high = _mm_shufflehi_epi16 (clamped, (1<<0) | (1<<2) | (0<<4) | (2<<6));
4603                     // high = xxxx00CD
4604     vint4 highswapped = shuffle_sse<2,3,0,1>(high);  // 00CDxxxx
4605     vint4 result = low | highswapped;   // ABCDxxxx
4606     _mm_storel_pd ((double *)values, _mm_castsi128_pd(result));
4607     // At this point, values[] should hold A,B,C,D
4608 #else
4609     SIMD_DO (values[i] = m_val[i]);
4610 #endif
4611 }
4612 
4613 
4614 
4615 OIIO_FORCEINLINE void vint4::store (unsigned char *values) const {
4616 #if OIIO_AVX512VL_ENABLED
4617     _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd);
4618 #elif OIIO_SIMD_SSE
4619     // Expressed as bytes and considering little endianness, we
4620     // currently have AxBxCxDx (the 'x' means don't care).
4621     vint4 clamped = m_simd & vint4(0xff);          // A000 B000 C000 D000
4622     vint4 swapped = shuffle_sse<1,0,3,2>(clamped); // B000 A000 D000 C000
4623     vint4 shifted = swapped << 8;                  // 0B00 0A00 0D00 0C00
4624     vint4 merged = clamped | shifted;              // AB00 xxxx CD00 xxxx
4625     vint4 merged2 = shuffle_sse<2,2,2,2>(merged);  // CD00 ...
4626     vint4 shifted2 = merged2 << 16;                // 00CD ...
4627     vint4 result = merged | shifted2;              // ABCD ...
4628     *(int*)values = result[0]; //extract<0>(result);
4629     // At this point, values[] should hold A,B,C,D
4630 #else
4631     SIMD_DO (values[i] = m_val[i]);
4632 #endif
4633 }
4634 
4635 
4636 
4637 
4638 template<int i0, int i1, int i2, int i3>
4639 OIIO_FORCEINLINE vint4 shuffle (const vint4& a) {
4640 #if OIIO_SIMD_SSE
4641     return shuffle_sse<i0,i1,i2,i3> (__m128i(a));
4642 #else
4643     return vint4(a[i0], a[i1], a[i2], a[i3]);
4644 #endif
4645 }
4646 
4647 template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); }
4648 
4649 
4650 template<int i>
4651 OIIO_FORCEINLINE int extract (const vint4& v) {
4652 #if OIIO_SIMD_SSE >= 4
4653     return _mm_extract_epi32(v.simd(), i);  // SSE4.1 only
4654 #else
4655     return v[i];
4656 #endif
4657 }
4658 
4659 #if OIIO_SIMD_SSE
4660 template<> OIIO_FORCEINLINE int extract<0> (const vint4& v) {
4661     return _mm_cvtsi128_si32(v.simd());
4662 }
4663 #endif
4664 
4665 template<int i>
4666 OIIO_FORCEINLINE vint4 insert (const vint4& a, int val) {
4667 #if OIIO_SIMD_SSE >= 4
4668     return _mm_insert_epi32 (a.simd(), val, i);
4669 #else
4670     vint4 tmp = a;
4671     tmp[i] = val;
4672     return tmp;
4673 #endif
4674 }
4675 
4676 
4677 
4678 OIIO_FORCEINLINE int vint4::x () const { return extract<0>(*this); }
4679 OIIO_FORCEINLINE int vint4::y () const { return extract<1>(*this); }
4680 OIIO_FORCEINLINE int vint4::z () const { return extract<2>(*this); }
4681 OIIO_FORCEINLINE int vint4::w () const { return extract<3>(*this); }
4682 OIIO_FORCEINLINE void vint4::set_x (int val) { *this = insert<0>(*this, val); }
4683 OIIO_FORCEINLINE void vint4::set_y (int val) { *this = insert<1>(*this, val); }
4684 OIIO_FORCEINLINE void vint4::set_z (int val) { *this = insert<2>(*this, val); }
4685 OIIO_FORCEINLINE void vint4::set_w (int val) { *this = insert<3>(*this, val); }
4686 
4687 
4688 OIIO_FORCEINLINE vint4 bitcast_to_int (const vbool4& x)
4689 {
4690 #if OIIO_SIMD_SSE
4691     return _mm_castps_si128 (x.simd());
4692 #else
4693     return *(vint4 *)&x;
4694 #endif
4695 }
4696 
4697 // Old names: (DEPRECATED 1.8)
4698 inline vint4 bitcast_to_int4 (const vbool4& x) { return bitcast_to_int(x); }
4699 
4700 
4701 OIIO_FORCEINLINE vint4 vreduce_add (const vint4& v) {
4702 #if OIIO_SIMD_SSE >= 3
4703     // People seem to agree that SSE3 does add reduction best with 2
4704     // horizontal adds.
4705     // suppose v = (a, b, c, d)
4706     simd::vint4 ab_cd = _mm_hadd_epi32 (v.simd(), v.simd());
4707     // ab_cd = (a+b, c+d, a+b, c+d)
4708     simd::vint4 abcd = _mm_hadd_epi32 (ab_cd.simd(), ab_cd.simd());
4709     // all abcd elements are a+b+c+d, return an element as fast as possible
4710     return abcd;
4711 #elif OIIO_SIMD_SSE >= 2
4712     // I think this is the best we can do for SSE2, and I'm still not sure
4713     // it's faster than the default scalar operation. But anyway...
4714     // suppose v = (a, b, c, d)
4715     vint4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
4716     // ab_ab_cd_cd = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
4717     vint4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
4718     // cd_cd_ab_ab = (c+d,c+d,a+b,a+b)
4719     vint4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;   // a+b+c+d in all components
4720     return abcd;
4721 #else
4722     return vint4(reduce_add(v));
4723 #endif
4724 }
4725 
4726 
4727 OIIO_FORCEINLINE int reduce_add (const vint4& v) {
4728 #if OIIO_SIMD_SSE
4729     return extract<0> (vreduce_add(v));
4730 #elif OIIO_SIMD_NEON
4731     return vaddvq_s32(v);
4732 #else
4733     SIMD_RETURN_REDUCE (int, 0, r += v[i]);
4734 #endif
4735 }
4736 
4737 
4738 OIIO_FORCEINLINE int reduce_and (const vint4& v) {
4739 #if OIIO_SIMD_SSE
4740     vint4 ab = v & shuffle<1,1,3,3>(v); // ab bb cd dd
4741     vint4 abcd = ab & shuffle<2>(ab);
4742     return extract<0>(abcd);
4743 #else
4744     SIMD_RETURN_REDUCE (int, -1, r &= v[i]);
4745 #endif
4746 }
4747 
4748 
4749 OIIO_FORCEINLINE int reduce_or (const vint4& v) {
4750 #if OIIO_SIMD_SSE
4751     vint4 ab = v | shuffle<1,1,3,3>(v); // ab bb cd dd
4752     vint4 abcd = ab | shuffle<2>(ab);
4753     return extract<0>(abcd);
4754 #else
4755     SIMD_RETURN_REDUCE (int, 0, r |= v[i]);
4756 #endif
4757 }
4758 
4759 
4760 
4761 OIIO_FORCEINLINE vint4 blend (const vint4& a, const vint4& b, const vbool4& mask) {
4762 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
4763     return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps(a.simd()),
4764                                             _mm_castsi128_ps(b.simd()), mask));
4765 #elif OIIO_SIMD_SSE
4766     return _mm_or_si128 (_mm_and_si128(_mm_castps_si128(mask.simd()), b.simd()),
4767                          _mm_andnot_si128(_mm_castps_si128(mask.simd()), a.simd()));
4768 #elif OIIO_SIMD_NEON
4769     return vbslq_s32 (mask.simd(), b.simd(), a.simd());
4770 #else
4771     SIMD_RETURN (vint4, mask[i] ? b[i] : a[i]);
4772 #endif
4773 }
4774 
4775 OIIO_FORCEINLINE vint4 blend0 (const vint4& a, const vbool4& mask) {
4776 #if OIIO_SIMD_SSE
4777     return _mm_and_si128(_mm_castps_si128(mask), a.simd());
4778 #else
4779     SIMD_RETURN (vint4, mask[i] ? a[i] : 0.0f);
4780 #endif
4781 }
4782 
4783 
4784 OIIO_FORCEINLINE vint4 blend0not (const vint4& a, const vbool4& mask) {
4785 #if OIIO_SIMD_SSE
4786     return _mm_andnot_si128(_mm_castps_si128(mask), a.simd());
4787 #else
4788     SIMD_RETURN (vint4, mask[i] ? 0.0f : a[i]);
4789 #endif
4790 }
4791 
4792 
4793 OIIO_FORCEINLINE vint4 select (const vbool4& mask, const vint4& a, const vint4& b) {
4794     return blend (b, a, mask);
4795 }
4796 
4797 
4798 
4799 OIIO_FORCEINLINE vint4 abs (const vint4& a) {
4800 #if OIIO_SIMD_SSE >= 3
4801     return _mm_abs_epi32(a.simd());
4802 #elif OIIO_SIMD_NEON
4803     return vabsq_s32(a.simd());
4804 #else
4805     SIMD_RETURN (vint4, std::abs(a[i]));
4806 #endif
4807 }
4808 
4809 
4810 
4811 OIIO_FORCEINLINE vint4 min (const vint4& a, const vint4& b) {
4812 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
4813     return _mm_min_epi32 (a, b);
4814 #elif OIIO_SIMD_NEON
4815     return vminq_s32(a, b);
4816 #else
4817     SIMD_RETURN (vint4, std::min(a[i], b[i]));
4818 #endif
4819 }
4820 
4821 
4822 OIIO_FORCEINLINE vint4 max (const vint4& a, const vint4& b) {
4823 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
4824     return _mm_max_epi32 (a, b);
4825 #elif OIIO_SIMD_NEON
4826     return vmaxq_s32(a, b);
4827 #else
4828     SIMD_RETURN (vint4, std::max(a[i], b[i]));
4829 #endif
4830 }
4831 
4832 
4833 OIIO_FORCEINLINE vint4 rotl(const vint4& x, int s) {
4834 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4835     // return _mm_rol_epi32 (x, s);
4836     // We want to do this ^^^ but this intrinsic only takes an *immediate*
4837     // argument for s, and there isn't a way to express in C++ that a
4838     // parameter must be an immediate/literal value from the caller.
4839     return (x<<s) | srl(x,32-s);
4840 #else
4841     return (x<<s) | srl(x,32-s);
4842 #endif
4843 }
4844 
4845 // DEPRECATED (2.1)
4846 OIIO_FORCEINLINE vint4 rotl32 (const vint4& x, const unsigned int k) {
4847     return rotl(x, k);
4848 }
4849 
4850 
4851 OIIO_FORCEINLINE vint4 andnot (const vint4& a, const vint4& b) {
4852 #if OIIO_SIMD_SSE
4853     return _mm_andnot_si128 (a.simd(), b.simd());
4854 #else
4855     SIMD_RETURN (vint4, ~(a[i]) & b[i]);
4856 #endif
4857 }
4858 
4859 
4860 // Implementation had to be after the definition of vint4::Zero.
4861 OIIO_FORCEINLINE vbool4::vbool4 (const vint4& ival) {
4862     m_simd = (ival != vint4::Zero());
4863 }
4864 
4865 
4866 
4867 OIIO_FORCEINLINE vint4 safe_mod (const vint4& a, const vint4& b) {
4868     // NO INTEGER MODULUS IN SSE!
4869     SIMD_RETURN (vint4, b[i] ? a[i] % b[i] : 0);
4870 }
4871 
4872 OIIO_FORCEINLINE vint4 safe_mod (const vint4& a, int b) {
4873     return b ? (a % b) : vint4::Zero();
4874 }
4875 
4876 
4877 
4878 
4879 //////////////////////////////////////////////////////////////////////
4880 // vint8 implementation
4881 
4882 OIIO_FORCEINLINE const vint8 & vint8::operator= (const vint8& other) {
4883     m_simd = other.m_simd;
4884     return *this;
4885 }
4886 
4887 OIIO_FORCEINLINE int vint8::operator[] (int i) const {
4888     OIIO_DASSERT(i<elements);
4889     return m_val[i];
4890 }
4891 
4892 OIIO_FORCEINLINE int& vint8::operator[] (int i) {
4893     OIIO_DASSERT(i<elements);
4894     return m_val[i];
4895 }
4896 
4897 OIIO_FORCEINLINE void vint8::setcomp (int i, int val) {
4898     OIIO_DASSERT(i<elements);
4899     m_val[i] = val;
4900 }
4901 
4902 
4903 OIIO_FORCEINLINE void vint8::load (int a) {
4904 #if OIIO_SIMD_AVX
4905     m_simd = _mm256_set1_epi32 (a);
4906 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4907     m_4[0].load(a);
4908     m_4[1].load(a);
4909 #else
4910     SIMD_CONSTRUCT (a);
4911 #endif
4912 }
4913 
4914 
4915 OIIO_FORCEINLINE void vint8::load (int a, int b, int c, int d,
4916                                   int e, int f, int g, int h) {
4917 #if OIIO_SIMD_AVX
4918     m_simd = _mm256_set_epi32 (h, g, f, e, d, c, b, a);
4919 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4920     m_4[0].load(a, b, c, d);
4921     m_4[1].load(e, f, g, h);
4922 #else
4923     m_val[0] = a;
4924     m_val[1] = b;
4925     m_val[2] = c;
4926     m_val[3] = d;
4927     m_val[4] = e;
4928     m_val[5] = f;
4929     m_val[6] = g;
4930     m_val[7] = h;
4931 #endif
4932 }
4933 
4934 
4935 OIIO_FORCEINLINE void vint8::load (const int *values) {
4936 #if OIIO_SIMD_AVX
4937     m_simd = _mm256_loadu_si256 ((const simd_t *)values);
4938 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4939     m_4[0].load(values);
4940     m_4[1].load(values+4);
4941 #else
4942     SIMD_CONSTRUCT (values[i]);
4943 #endif
4944 }
4945 
4946 
4947 OIIO_FORCEINLINE void vint8::load (const int *values, int n)
4948 {
4949     OIIO_DASSERT (n >= 0 && n <= elements);
4950 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4951     m_simd = _mm256_maskz_loadu_epi32 ((~(0xff << n)), values);
4952 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4953     if (n > 4) {
4954         vint4 lo, hi;
4955         lo.load (values);
4956         hi.load (values+4, n-4);
4957         m_4[0] = lo;
4958         m_4[1] = hi;
4959     } else {
4960         vint4 lo, hi;
4961         lo.load (values, n);
4962         hi.clear();
4963         m_4[0] = lo;
4964         m_4[1] = hi;
4965     }
4966 #else
4967     for (int i = 0; i < n; ++i)
4968         m_val[i] = values[i];
4969     for (int i = n; i < elements; ++i)
4970         m_val[i] = 0;
4971 #endif
4972 }
4973 
4974 
4975 OIIO_FORCEINLINE void vint8::load (const short *values) {
4976 #if OIIO_SIMD_AVX >= 2
4977     m_simd = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)values));
4978 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4979     m_4[0].load(values);
4980     m_4[1].load(values+4);
4981 #else
4982     SIMD_CONSTRUCT (values[i]);
4983 #endif
4984 }
4985 
4986 OIIO_FORCEINLINE void vint8::load (const unsigned short *values) {
4987 #if OIIO_SIMD_AVX >= 2
4988     m_simd = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)values));
4989 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4990     m_4[0].load(values);
4991     m_4[1].load(values+4);
4992 #else
4993     SIMD_CONSTRUCT (values[i]);
4994 #endif
4995 }
4996 
4997 
4998 OIIO_FORCEINLINE void vint8::load (const char *values) {
4999 #if OIIO_SIMD_AVX >= 2
5000     __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
5001     m_simd = _mm256_cvtepi8_epi32 (bytes);
5002 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5003     m_4[0].load(values);
5004     m_4[1].load(values+4);
5005 #else
5006     SIMD_CONSTRUCT (values[i]);
5007 #endif
5008 }
5009 
5010 OIIO_FORCEINLINE void vint8::load (const unsigned char *values) {
5011 #if OIIO_SIMD_AVX >= 2
5012     __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
5013     m_simd = _mm256_cvtepu8_epi32 (bytes);
5014 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5015     m_4[0].load(values);
5016     m_4[1].load(values+4);
5017 #else
5018     SIMD_CONSTRUCT (values[i]);
5019 #endif
5020 }
5021 
5022 
5023 
5024 OIIO_FORCEINLINE vint8::vint8 (int a) { load(a); }
5025 
5026 OIIO_FORCEINLINE vint8::vint8 (int a, int b, int c, int d,
5027                              int e, int f, int g, int h) {
5028     load(a,b,c,d,e,f,g,h);
5029 }
5030 
5031 OIIO_FORCEINLINE vint8::vint8 (const int *vals) { load (vals); }
5032 OIIO_FORCEINLINE vint8::vint8 (const unsigned short *vals) { load(vals); }
5033 OIIO_FORCEINLINE vint8::vint8 (const short *vals) { load(vals); }
5034 OIIO_FORCEINLINE vint8::vint8 (const unsigned char *vals) { load(vals); }
5035 OIIO_FORCEINLINE vint8::vint8 (const char *vals) { load(vals); }
5036 
5037 OIIO_FORCEINLINE const vint8 & vint8::operator= (int a) { load(a); return *this; }
5038 
5039 
5040 OIIO_FORCEINLINE void vint8::store (int *values) const {
5041 #if OIIO_SIMD_AVX
5042     // Use an unaligned store -- it's just as fast when the memory turns
5043     // out to be aligned, nearly as fast even when unaligned. Not worth
5044     // the headache of using stores that require alignment.
5045     _mm256_storeu_si256 ((simd_t *)values, m_simd);
5046 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5047     m_4[0].store(values);
5048     m_4[1].store(values+4);
5049 #else
5050     SIMD_DO (values[i] = m_val[i]);
5051 #endif
5052 }
5053 
5054 
5055 OIIO_FORCEINLINE void vint8::load_mask (int mask, const int *values) {
5056 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5057     m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values);
5058 #elif OIIO_SIMD_AVX >= 2
5059     m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)));
5060 #else
5061     SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0);
5062 #endif
5063 }
5064 
5065 
5066 OIIO_FORCEINLINE void vint8::load_mask (const vbool8& mask, const int *values) {
5067 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5068     m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values);
5069 #elif OIIO_SIMD_AVX >= 2
5070     m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(mask));
5071 #else
5072     SIMD_CONSTRUCT (mask[i] ? values[i] : 0);
5073 #endif
5074 }
5075 
5076 
5077 OIIO_FORCEINLINE void vint8::store_mask (int mask, int *values) const {
5078 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5079     _mm256_mask_storeu_epi32 (values, __mmask8(mask), m_simd);
5080 #elif OIIO_SIMD_AVX >= 2
5081     _mm256_maskstore_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd);
5082 #else
5083     SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
5084 #endif
5085 }
5086 
5087 
5088 OIIO_FORCEINLINE void vint8::store_mask (const vbool8& mask, int *values) const {
5089 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5090     _mm256_mask_storeu_epi32 (values, __mmask8(mask.bitmask()), m_simd);
5091 #elif OIIO_SIMD_AVX >= 2
5092     _mm256_maskstore_epi32 (values, _mm256_castps_si256(mask), m_simd);
5093 #else
5094     SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
5095 #endif
5096 }
5097 
5098 
5099 template <int scale>
5100 OIIO_FORCEINLINE void
5101 vint8::gather (const value_t *baseptr, const vint_t& vindex)
5102 {
5103 #if OIIO_SIMD_AVX >= 2
5104     m_simd = _mm256_i32gather_epi32 (baseptr, vindex, scale);
5105 #else
5106     SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
5107 #endif
5108 }
5109 
5110 template<int scale>
5111 OIIO_FORCEINLINE void
5112 vint8::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
5113 {
5114 #if OIIO_SIMD_AVX >= 2
5115     m_simd = _mm256_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm256_cvtps_epi32(mask), scale);
5116 #else
5117     SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
5118 #endif
5119 }
5120 
5121 template<int scale>
5122 OIIO_FORCEINLINE void
5123 vint8::scatter (value_t *baseptr, const vint_t& vindex) const
5124 {
5125 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5126     _mm256_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
5127 #else
5128     SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
5129 #endif
5130 }
5131 
5132 template<int scale>
5133 OIIO_FORCEINLINE void
5134 vint8::scatter_mask (const bool_t& mask, value_t *baseptr,
5135                      const vint_t& vindex) const
5136 {
5137 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5138     _mm256_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale);
5139 #else
5140     SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
5141 #endif
5142 }
5143 
5144 
5145 OIIO_FORCEINLINE void vint8::clear () {
5146 #if OIIO_SIMD_AVX
5147     m_simd = _mm256_setzero_si256();
5148 #else
5149     *this = 0;
5150 #endif
5151 }
5152 
5153 
5154 OIIO_FORCEINLINE const vint8 vint8::Zero () {
5155 #if OIIO_SIMD_AVX
5156     return _mm256_setzero_si256();
5157 #else
5158     return 0;
5159 #endif
5160 }
5161 
5162 OIIO_FORCEINLINE const vint8 vint8::One () { return vint8(1); }
5163 
5164 OIIO_FORCEINLINE const vint8 vint8::NegOne () { return vint8(-1); }
5165 
5166 
5167 OIIO_FORCEINLINE const vint8 vint8::Iota (int start, int step) {
5168     return vint8 (start+0*step, start+1*step, start+2*step, start+3*step,
5169                  start+4*step, start+5*step, start+6*step, start+7*step);
5170 }
5171 
5172 
5173 OIIO_FORCEINLINE const vint8 vint8::Giota () {
5174     return vint8 (1<<0, 1<<1, 1<<2, 1<<3,  1<<4, 1<<5, 1<<6, 1<<7);
5175 }
5176 
5177 
5178 OIIO_FORCEINLINE vint4 vint8::lo () const {
5179 #if OIIO_SIMD_AVX
5180     return _mm256_castsi256_si128 (simd());
5181 #else
5182     return m_4[0];
5183 #endif
5184 }
5185 
5186 OIIO_FORCEINLINE vint4 vint8::hi () const {
5187 #if OIIO_SIMD_AVX
5188     return _mm256_extractf128_si256 (simd(), 1);
5189 #else
5190     return m_4[1];
5191 #endif
5192 }
5193 
5194 
5195 OIIO_FORCEINLINE vint8::vint8 (const vint4& lo, const vint4 &hi) {
5196 #if OIIO_SIMD_AVX
5197     __m256i r = _mm256_castsi128_si256 (lo);
5198     m_simd = _mm256_insertf128_si256 (r, hi, 1);
5199     // N.B. equivalent, if available: m_simd = _mm256_set_m128i (hi, lo);
5200     // FIXME: when would this not be available?
5201 #else
5202     m_4[0] = lo;
5203     m_4[1] = hi;
5204 #endif
5205 }
5206 
5207 
5208 OIIO_FORCEINLINE vint8 operator+ (const vint8& a, const vint8& b) {
5209 #if OIIO_SIMD_AVX >= 2
5210     return _mm256_add_epi32 (a.simd(), b.simd());
5211 #else
5212     SIMD_RETURN (vint8, a[i] + b[i]);
5213 #endif
5214 }
5215 
5216 
5217 OIIO_FORCEINLINE const vint8& operator+= (vint8& a, const vint8& b) {
5218     return a = a + b;
5219 }
5220 
5221 
5222 OIIO_FORCEINLINE vint8 operator- (const vint8& a) {
5223 #if OIIO_SIMD_AVX >= 2
5224     return _mm256_sub_epi32 (_mm256_setzero_si256(), a);
5225 #else
5226     SIMD_RETURN (vint8, -a[i]);
5227 #endif
5228 }
5229 
5230 
5231 OIIO_FORCEINLINE vint8 operator- (const vint8& a, const vint8& b) {
5232 #if OIIO_SIMD_AVX >= 2
5233     return _mm256_sub_epi32 (a.simd(), b.simd());
5234 #else
5235     SIMD_RETURN (vint8, a[i] - b[i]);
5236 #endif
5237 }
5238 
5239 
5240 OIIO_FORCEINLINE const vint8 &operator-= (vint8& a, const vint8& b) {
5241     return a = a - b;
5242 }
5243 
5244 
5245 OIIO_FORCEINLINE vint8 operator* (const vint8& a, const vint8& b) {
5246 #if OIIO_SIMD_AVX >= 2
5247     return _mm256_mullo_epi32 (a.simd(), b.simd());
5248 #else
5249     SIMD_RETURN (vint8, a[i] * b[i]);
5250 #endif
5251 }
5252 
5253 
5254 OIIO_FORCEINLINE const vint8& operator*= (vint8& a, const vint8& b) { return a = a * b; }
5255 OIIO_FORCEINLINE const vint8& operator*= (vint8& a, int b) { return a = a * b; }
5256 
5257 
5258 OIIO_FORCEINLINE vint8 operator/ (const vint8& a, const vint8& b) {
5259     // NO INTEGER DIVISION IN SSE or AVX!
5260     SIMD_RETURN (vint8, a[i] / b[i]);
5261 }
5262 
5263 OIIO_FORCEINLINE const vint8& operator/= (vint8& a, const vint8& b) { return a = a / b; }
5264 
5265 
5266 OIIO_FORCEINLINE vint8 operator% (const vint8& a, const vint8& b) {
5267     // NO INTEGER MODULUS IN SSE or AVX!
5268     SIMD_RETURN (vint8, a[i] % b[i]);
5269 }
5270 
5271 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, const vint8& b) { return a = a % b; }
5272 
5273 OIIO_FORCEINLINE vint8 operator% (const vint8& a, int w) {
5274     // NO INTEGER MODULUS in SSE or AVX!
5275     SIMD_RETURN (vint8, a[i] % w);
5276 }
5277 
5278 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, int b) { return a = a % b; }
5279 
5280 
5281 OIIO_FORCEINLINE vint8 operator& (const vint8& a, const vint8& b) {
5282 #if OIIO_SIMD_AVX >= 2
5283     return _mm256_and_si256 (a.simd(), b.simd());
5284 #else
5285     SIMD_RETURN (vint8, a[i] & b[i]);
5286 #endif
5287 }
5288 
5289 OIIO_FORCEINLINE const vint8& operator&= (vint8& a, const vint8& b) { return a = a & b; }
5290 
5291 OIIO_FORCEINLINE vint8 operator| (const vint8& a, const vint8& b) {
5292 #if OIIO_SIMD_AVX >= 2
5293     return _mm256_or_si256 (a.simd(), b.simd());
5294 #else
5295     SIMD_RETURN (vint8, a[i] | b[i]);
5296 #endif
5297 }
5298 
5299 OIIO_FORCEINLINE const vint8& operator|= (vint8& a, const vint8& b) { return a = a | b; }
5300 
5301 OIIO_FORCEINLINE vint8 operator^ (const vint8& a, const vint8& b) {
5302 #if OIIO_SIMD_AVX >= 2
5303     return _mm256_xor_si256 (a.simd(), b.simd());
5304 #else
5305     SIMD_RETURN (vint8, a[i] ^ b[i]);
5306 #endif
5307 }
5308 
5309 OIIO_FORCEINLINE const vint8& operator^= (vint8& a, const vint8& b) { return a = a ^ b; }
5310 
5311 
5312 OIIO_FORCEINLINE vint8 operator~ (const vint8& a) {
5313 #if OIIO_SIMD_AVX >= 2
5314     return a ^ a.NegOne();
5315 #else
5316     SIMD_RETURN (vint8, ~a[i]);
5317 #endif
5318 }
5319 
5320 
5321 OIIO_FORCEINLINE vint8 operator<< (const vint8& a, unsigned int bits) {
5322 #if OIIO_SIMD_AVX >= 2
5323     return _mm256_slli_epi32 (a, bits);
5324 #elif OIIO_SIMD_SSE
5325     return vint8 (a.lo() << bits, a.hi() << bits);
5326 #else
5327     SIMD_RETURN (vint8, a[i] << bits);
5328 #endif
5329 }
5330 
5331 
5332 OIIO_FORCEINLINE const vint8& operator<<= (vint8& a, const unsigned int bits) {
5333     return a = a << bits;
5334 }
5335 
5336 OIIO_FORCEINLINE vint8 operator>> (const vint8& a, const unsigned int bits) {
5337 #if OIIO_SIMD_AVX >= 2
5338     return _mm256_srai_epi32 (a, bits);
5339 #elif OIIO_SIMD_SSE
5340     return vint8 (a.lo() >> bits, a.hi() >> bits);
5341 #else
5342     SIMD_RETURN (vint8, a[i] >> bits);
5343 #endif
5344 }
5345 
5346 OIIO_FORCEINLINE const vint8& operator>>= (vint8& a, const unsigned int bits) {
5347     return a = a >> bits;
5348 }
5349 
5350 
5351 OIIO_FORCEINLINE vint8 srl (const vint8& a, const unsigned int bits) {
5352 #if OIIO_SIMD_AVX >= 2
5353     return _mm256_srli_epi32 (a, bits);
5354 #else
5355     SIMD_RETURN (vint8, int ((unsigned int)(a[i]) >> bits));
5356 #endif
5357 }
5358 
5359 
5360 OIIO_FORCEINLINE vbool8 operator== (const vint8& a, const vint8& b) {
5361     // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5362 #if OIIO_SIMD_AVX >= 2
5363     return _mm256_castsi256_ps(_mm256_cmpeq_epi32 (a.m_simd, b.m_simd));
5364 #elif OIIO_SIMD_SSE  /* Fall back to 4-wide */
5365     return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
5366 #else
5367     SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
5368 #endif
5369 }
5370 
5371 
5372 OIIO_FORCEINLINE vbool8 operator!= (const vint8& a, const vint8& b) {
5373     // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5374     return ! (a == b);
5375 }
5376 
5377 
5378 OIIO_FORCEINLINE vbool8 operator> (const vint8& a, const vint8& b) {
5379     // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5380 #if OIIO_SIMD_AVX >= 2
5381     return _mm256_castsi256_ps(_mm256_cmpgt_epi32 (a, b));
5382 #elif OIIO_SIMD_SSE  /* Fall back to 4-wide */
5383     return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
5384 #else
5385     SIMD_RETURN (vbool8, a[i] > b[i] ? -1 : 0);
5386 #endif
5387 }
5388 
5389 
5390 OIIO_FORCEINLINE vbool8 operator< (const vint8& a, const vint8& b) {
5391     // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5392 #if OIIO_SIMD_AVX >= 2
5393     // No lt or lte!
5394     return (b > a);
5395 #elif OIIO_SIMD_SSE  /* Fall back to 4-wide */
5396     return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
5397 #else
5398     SIMD_RETURN (vbool8, a[i] < b[i] ? -1 : 0);
5399 #endif
5400 }
5401 
5402 
5403 OIIO_FORCEINLINE vbool8 operator>= (const vint8& a, const vint8& b) {
5404     // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5405     return (a > b) | (a == b);
5406 }
5407 
5408 
5409 OIIO_FORCEINLINE vbool8 operator<= (const vint8& a, const vint8& b) {
5410     // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5411     return (b > a) | (a == b);
5412 }
5413 
5414 
5415 inline std::ostream& operator<< (std::ostream& cout, const vint8& val) {
5416     cout << val[0];
5417     for (int i = 1; i < val.elements; ++i)
5418         cout << ' ' << val[i];
5419     return cout;
5420 }
5421 
5422 
5423 OIIO_FORCEINLINE void vint8::store (int *values, int n) const {
5424     OIIO_DASSERT (n >= 0 && n <= elements);
5425 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5426     // This SHOULD be fast, but in my benchmarks, it is slower!
5427     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
5428     // Re-test this periodically with new Intel hardware.
5429     _mm256_mask_storeu_epi32 (values, __mmask8(~(0xff << n)), m_simd);
5430 #elif OIIO_SIMD_SSE
5431     if (n <= 4) {
5432         lo().store (values, n);
5433     } else if (n < 8) {
5434         lo().store (values);
5435         hi().store (values+4, n-4);
5436     } else {
5437         store (values);
5438     }
5439 #else
5440     for (int i = 0; i < n; ++i)
5441         values[i] = m_val[i];
5442 #endif
5443 }
5444 
5445 
5446 // FIXME(AVX): fast vint8 store to unsigned short, unsigned char
5447 
5448 OIIO_FORCEINLINE void vint8::store (unsigned short *values) const {
5449 #if OIIO_AVX512VL_ENABLED
5450     _mm256_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xff), m_simd);
5451 #elif OIIO_SIMD_SSE
5452     lo().store (values);
5453     hi().store (values+4);
5454 #else
5455     SIMD_DO (values[i] = m_val[i]);
5456 #endif
5457 }
5458 
5459 
5460 OIIO_FORCEINLINE void vint8::store (unsigned char *values) const {
5461 #if OIIO_AVX512VL_ENABLED
5462     _mm256_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xff), m_simd);
5463 #elif OIIO_SIMD_SSE
5464     lo().store (values);
5465     hi().store (values+4);
5466 #else
5467     SIMD_DO (values[i] = m_val[i]);
5468 #endif
5469 }
5470 
5471 
5472 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
5473 OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
5474 #if OIIO_SIMD_AVX >= 2
5475     vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
5476     return _mm256_castps_si256 (_mm256_permutevar8x32_ps (_mm256_castsi256_ps(a.simd()), index.simd()));
5477 #else
5478     return vint8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
5479 #endif
5480 }
5481 
5482 template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
5483     return shuffle<i,i,i,i,i,i,i,i>(a);
5484 }
5485 
5486 
5487 template<int i>
5488 OIIO_FORCEINLINE int extract (const vint8& v) {
5489 #if OIIO_SIMD_AVX && !_WIN32
5490     return _mm256_extract_epi32(v.simd(), i);
5491 #else
5492     return v[i];
5493 #endif
5494 }
5495 
5496 
5497 template<int i>
5498 OIIO_FORCEINLINE vint8 insert (const vint8& a, int val) {
5499 #if OIIO_SIMD_AVX && !_WIN32
5500     return _mm256_insert_epi32 (a.simd(), val, i);
5501 #else
5502     vint8 tmp = a;
5503     tmp[i] = val;
5504     return tmp;
5505 #endif
5506 }
5507 
5508 
5509 OIIO_FORCEINLINE int vint8::x () const { return extract<0>(*this); }
5510 OIIO_FORCEINLINE int vint8::y () const { return extract<1>(*this); }
5511 OIIO_FORCEINLINE int vint8::z () const { return extract<2>(*this); }
5512 OIIO_FORCEINLINE int vint8::w () const { return extract<3>(*this); }
5513 OIIO_FORCEINLINE void vint8::set_x (int val) { *this = insert<0>(*this, val); }
5514 OIIO_FORCEINLINE void vint8::set_y (int val) { *this = insert<1>(*this, val); }
5515 OIIO_FORCEINLINE void vint8::set_z (int val) { *this = insert<2>(*this, val); }
5516 OIIO_FORCEINLINE void vint8::set_w (int val) { *this = insert<3>(*this, val); }
5517 
5518 
5519 OIIO_FORCEINLINE vint8 bitcast_to_int (const vbool8& x)
5520 {
5521 #if OIIO_SIMD_AVX
5522     return _mm256_castps_si256 (x.simd());
5523 #else
5524     return *(vint8 *)&x;
5525 #endif
5526 }
5527 
5528 
5529 OIIO_FORCEINLINE vint8 vreduce_add (const vint8& v) {
5530 #if OIIO_SIMD_AVX >= 2
5531     // From Syrah:
5532     vint8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_epi32(v.simd(), _mm256_setzero_si256());
5533     vint8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_epi32(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_si256());
5534     // get efgh in the 0-idx slot
5535     vint8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
5536     vint8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
5537     return shuffle<0>(final_sum);
5538 #elif OIIO_SIMD_SSE
5539     vint4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
5540     return vint8(hadd4, hadd4);
5541 #else
5542     return vint8(reduce_add(v));
5543 #endif
5544 }
5545 
5546 
5547 OIIO_FORCEINLINE int reduce_add (const vint8& v) {
5548 #if OIIO_SIMD_SSE
5549     return extract<0> (vreduce_add(v));
5550 #else
5551     return reduce_add(v.lo()) + reduce_add(v.hi());
5552 #endif
5553 }
5554 
5555 
5556 OIIO_FORCEINLINE int reduce_and (const vint8& v) {
5557 #if OIIO_SSE_AVX >= 2
5558     vint8 ab = v & shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh
5559     vint8 abcd = ab & shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x
5560     vint8 abcdefgh = abcd & shuffle<4>(abcdefgh); // abcdefgh x x x x x x x
5561     return extract<0> (abcdefgh);
5562 #else
5563     // AVX 1.0 or less -- use SSE
5564     return reduce_and(v.lo() & v.hi());
5565 #endif
5566 }
5567 
5568 
5569 OIIO_FORCEINLINE int reduce_or (const vint8& v) {
5570 #if OIIO_SSE_AVX >= 2
5571     vint8 ab = v | shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh
5572     vint8 abcd = ab | shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x
5573     vint8 abcdefgh = abcd | shuffle<4>(abcdefgh); // abcdefgh x x x x x x x
5574     return extract<0> (abcdefgh);
5575 #else
5576     // AVX 1.0 or less -- use SSE
5577     return reduce_or(v.lo() | v.hi());
5578 #endif
5579 }
5580 
5581 
5582 OIIO_FORCEINLINE vint8 blend (const vint8& a, const vint8& b, const vbool8& mask) {
5583 #if OIIO_SIMD_AVX
5584     return _mm256_castps_si256 (_mm256_blendv_ps (_mm256_castsi256_ps(a.simd()),
5585                                                   _mm256_castsi256_ps(b.simd()), mask));
5586 #elif OIIO_SIMD_SSE
5587     return vint8 (blend(a.lo(), b.lo(), mask.lo()),
5588                  blend(a.hi(), b.hi(), mask.hi()));
5589 #else
5590     SIMD_RETURN (vint8, mask[i] ? b[i] : a[i]);
5591 #endif
5592 }
5593 
5594 
5595 OIIO_FORCEINLINE vint8 blend0 (const vint8& a, const vbool8& mask) {
5596 // FIXME: More efficient for AVX-512 to use
5597 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(maxk),a))?
5598 #if OIIO_SIMD_AVX
5599     return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.simd()), mask));
5600 #elif OIIO_SIMD_SSE
5601     return vint8 (blend0(a.lo(), mask.lo()),
5602                  blend0(a.hi(), mask.hi()));
5603 #else
5604     SIMD_RETURN (vint8, mask[i] ? a[i] : 0.0f);
5605 #endif
5606 }
5607 
5608 
5609 OIIO_FORCEINLINE vint8 blend0not (const vint8& a, const vbool8& mask) {
5610 // FIXME: More efficient for AVX-512 to use
5611 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(!maxk),a))?
5612 #if OIIO_SIMD_AVX
5613     return _mm256_castps_si256 (_mm256_andnot_ps (mask.simd(), _mm256_castsi256_ps(a.simd())));
5614 #elif OIIO_SIMD_SSE
5615     return vint8 (blend0not(a.lo(), mask.lo()),
5616                  blend0not(a.hi(), mask.hi()));
5617 #else
5618     SIMD_RETURN (vint8, mask[i] ? 0.0f : a[i]);
5619 #endif
5620 }
5621 
5622 OIIO_FORCEINLINE vint8 select (const vbool8& mask, const vint8& a, const vint8& b) {
5623     return blend (b, a, mask);
5624 }
5625 
5626 
5627 OIIO_FORCEINLINE vint8 abs (const vint8& a) {
5628 #if OIIO_SIMD_AVX >= 2
5629     return _mm256_abs_epi32(a.simd());
5630 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5631     return vint8(abs(a.lo()), abs(a.hi()));
5632 #else
5633     SIMD_RETURN (vint8, std::abs(a[i]));
5634 #endif
5635 }
5636 
5637 
5638 OIIO_FORCEINLINE vint8 min (const vint8& a, const vint8& b) {
5639 #if OIIO_SIMD_AVX >= 2
5640     return _mm256_min_epi32 (a, b);
5641 #else
5642     return vint8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
5643 #endif
5644 }
5645 
5646 
5647 OIIO_FORCEINLINE vint8 max (const vint8& a, const vint8& b) {
5648 #if OIIO_SIMD_AVX >= 2
5649     return _mm256_max_epi32 (a, b);
5650 #else
5651     return vint8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
5652 #endif
5653 }
5654 
5655 
5656 OIIO_FORCEINLINE vint8 rotl(const vint8& x, int s) {
5657 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5658     // return _mm256_rol_epi32 (x, s);
5659     // We want to do this ^^^ but this intrinsic only takes an *immediate*
5660     // argument for s, and there isn't a way to express in C++ that a
5661     // parameter must be an immediate/literal value from the caller.
5662     return (x<<s) | srl(x,32-s);
5663 #else
5664     return (x<<s) | srl(x,32-s);
5665 #endif
5666 }
5667 
5668 // DEPRECATED (2.1)
5669 OIIO_FORCEINLINE vint8 rotl32 (const vint8& x, const unsigned int k) {
5670     return rotl(x, k);
5671 }
5672 
5673 
5674 OIIO_FORCEINLINE vint8 andnot (const vint8& a, const vint8& b) {
5675 #if OIIO_SIMD_AVX >= 2
5676     return _mm256_andnot_si256 (a.simd(), b.simd());
5677 #elif OIIO_SIMD_AVX >= 1
5678     return _mm256_castps_si256 (_mm256_andnot_ps (_mm256_castsi256_ps(a.simd()), _mm256_castsi256_ps(b.simd())));
5679 #else
5680     SIMD_RETURN (vint8, ~(a[i]) & b[i]);
5681 #endif
5682 }
5683 
5684 
5685 // Implementation had to be after the definition of vint8::Zero.
5686 OIIO_FORCEINLINE vbool8::vbool8 (const vint8& ival) {
5687     m_simd = (ival != vint8::Zero());
5688 }
5689 
5690 
5691 
5692 OIIO_FORCEINLINE vint8 safe_mod (const vint8& a, const vint8& b) {
5693     // NO INTEGER MODULUS IN SSE!
5694     SIMD_RETURN (vint8, b[i] ? a[i] % b[i] : 0);
5695 }
5696 
5697 OIIO_FORCEINLINE vint8 safe_mod (const vint8& a, int b) {
5698     return b ? (a % b) : vint8::Zero();
5699 }
5700 
5701 
5702 
5703 
5704 //////////////////////////////////////////////////////////////////////
5705 // vint16 implementation
5706 
5707 OIIO_FORCEINLINE const vint16 & vint16::operator= (const vint16& other) {
5708     m_simd = other.m_simd;
5709     return *this;
5710 }
5711 
5712 OIIO_FORCEINLINE int vint16::operator[] (int i) const {
5713     OIIO_DASSERT(i<elements);
5714     return m_val[i];
5715 }
5716 
5717 OIIO_FORCEINLINE int& vint16::operator[] (int i) {
5718     OIIO_DASSERT(i<elements);
5719     return m_val[i];
5720 }
5721 
5722 OIIO_FORCEINLINE void vint16::setcomp (int i, int val) {
5723     OIIO_DASSERT(i<elements);
5724     m_val[i] = val;
5725 }
5726 
5727 
5728 OIIO_FORCEINLINE void vint16::load (int a) {
5729 #if OIIO_SIMD_AVX >= 512
5730     m_simd = _mm512_set1_epi32 (a);
5731 #else
5732     m_8[0].load (a);
5733     m_8[1].load (a);
5734 #endif
5735 }
5736 
5737 
5738 OIIO_FORCEINLINE void vint16::load (int v0, int v1, int v2, int v3,
5739                                    int v4, int v5, int v6, int v7,
5740                                    int v8, int v9, int v10, int v11,
5741                                    int v12, int v13, int v14, int v15) {
5742 #if OIIO_SIMD_AVX >= 512
5743     m_simd = _mm512_setr_epi32 (v0, v1, v2, v3, v4, v5, v6, v7,
5744                                 v8, v9, v10, v11, v12, v13, v14, v15);
5745 #else
5746     m_val[ 0] = v0;
5747     m_val[ 1] = v1;
5748     m_val[ 2] = v2;
5749     m_val[ 3] = v3;
5750     m_val[ 4] = v4;
5751     m_val[ 5] = v5;
5752     m_val[ 6] = v6;
5753     m_val[ 7] = v7;
5754     m_val[ 8] = v8;
5755     m_val[ 9] = v9;
5756     m_val[10] = v10;
5757     m_val[11] = v11;
5758     m_val[12] = v12;
5759     m_val[13] = v13;
5760     m_val[14] = v14;
5761     m_val[15] = v15;
5762 #endif
5763 }
5764 
5765 
5766 OIIO_FORCEINLINE void vint16::load (const int *values) {
5767 #if OIIO_SIMD_AVX >= 512
5768     m_simd = _mm512_loadu_si512 ((const simd_t *)values);
5769 #else
5770     m_8[0].load (values);
5771     m_8[1].load (values+8);
5772 #endif
5773 }
5774 
5775 
5776 OIIO_FORCEINLINE void vint16::load (const int *values, int n)
5777 {
5778 #if OIIO_SIMD_AVX >= 512
5779     m_simd = _mm512_maskz_loadu_epi32 (__mmask16(~(0xffff << n)), values);
5780 #else
5781     if (n > 8) {
5782         m_8[0].load (values);
5783         m_8[1].load (values+8, n-8);
5784     } else {
5785         m_8[0].load (values, n);
5786         m_8[1].clear ();
5787     }
5788 #endif
5789 }
5790 
5791 
5792 OIIO_FORCEINLINE void vint16::load (const short *values) {
5793 #if OIIO_SIMD_AVX >= 512
5794     m_simd = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)values));
5795 #else
5796     m_8[0].load (values);
5797     m_8[1].load (values+8);
5798 #endif
5799 }
5800 
5801 OIIO_FORCEINLINE void vint16::load (const unsigned short *values) {
5802 #if OIIO_SIMD_AVX >= 512
5803     m_simd = _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)values));
5804 #else
5805     m_8[0].load (values);
5806     m_8[1].load (values+8);
5807 #endif
5808 }
5809 
5810 
5811 OIIO_FORCEINLINE void vint16::load (const char *values) {
5812 #if OIIO_SIMD_AVX >= 512
5813     m_simd = _mm512_cvtepi8_epi32(_mm_loadu_si128((__m128i*)values));
5814 #else
5815     m_8[0].load (values);
5816     m_8[1].load (values+8);
5817 #endif
5818 }
5819 
5820 OIIO_FORCEINLINE void vint16::load (const unsigned char *values) {
5821 #if OIIO_SIMD_AVX >= 512
5822     m_simd = _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)values));
5823 #else
5824     m_8[0].load (values);
5825     m_8[1].load (values+8);
5826 #endif
5827 }
5828 
5829 
5830 OIIO_FORCEINLINE vint16::vint16 (int a) { load(a); }
5831 
5832 OIIO_FORCEINLINE vint16::vint16 (int v0, int v1, int v2, int v3,
5833                                int v4, int v5, int v6, int v7,
5834                                int v8, int v9, int v10, int v11,
5835                                int v12, int v13, int v14, int v15) {
5836     load (v0, v1, v2, v3, v4, v5, v6, v7,
5837           v8, v9, v10, v11, v12, v13, v14, v15);
5838 }
5839 
5840 OIIO_FORCEINLINE vint16::vint16 (const int *vals) { load (vals); }
5841 OIIO_FORCEINLINE vint16::vint16 (const unsigned short *vals) { load(vals); }
5842 OIIO_FORCEINLINE vint16::vint16 (const short *vals) { load(vals); }
5843 OIIO_FORCEINLINE vint16::vint16 (const unsigned char *vals) { load(vals); }
5844 OIIO_FORCEINLINE vint16::vint16 (const char *vals) { load(vals); }
5845 
5846 OIIO_FORCEINLINE const vint16 & vint16::operator= (int a) { load(a); return *this; }
5847 
5848 
5849 OIIO_FORCEINLINE void vint16::load_mask (const vbool16 &mask, const int *values) {
5850 #if OIIO_SIMD_AVX >= 512
5851     m_simd = _mm512_maskz_loadu_epi32 (mask, (const simd_t *)values);
5852 #else
5853     m_8[0].load_mask (mask.lo(), values);
5854     m_8[1].load_mask (mask.hi(), values+8);
5855 #endif
5856 }
5857 
5858 
5859 OIIO_FORCEINLINE void vint16::store_mask (const vbool16 &mask, int *values) const {
5860 #if OIIO_SIMD_AVX >= 512
5861     _mm512_mask_storeu_epi32 (values, mask.bitmask(), m_simd);
5862 #else
5863     lo().store_mask (mask.lo(), values);
5864     hi().store_mask (mask.hi(), values+8);
5865 #endif
5866 }
5867 
5868 
5869 template <int scale>
5870 OIIO_FORCEINLINE void
5871 vint16::gather (const value_t *baseptr, const vint_t& vindex) {
5872 #if OIIO_SIMD_AVX >= 512
5873     m_simd = _mm512_i32gather_epi32 (vindex, baseptr, scale);
5874 #else
5875     m_8[0].gather<scale> (baseptr, vindex.lo());
5876     m_8[1].gather<scale> (baseptr, vindex.hi());
5877 #endif
5878 }
5879 
5880 template<int scale>
5881 OIIO_FORCEINLINE void
5882 vint16::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex) {
5883 #if OIIO_SIMD_AVX >= 512
5884     m_simd = _mm512_mask_i32gather_epi32 (m_simd, mask, vindex, baseptr, scale);
5885 #else
5886     m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
5887     m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
5888 #endif
5889 }
5890 
5891 template<int scale>
5892 OIIO_FORCEINLINE void
5893 vint16::scatter (value_t *baseptr, const vint_t& vindex) const {
5894 #if OIIO_SIMD_AVX >= 512
5895     _mm512_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
5896 #else
5897     lo().scatter<scale> (baseptr, vindex.lo());
5898     hi().scatter<scale> (baseptr, vindex.hi());
5899 #endif
5900 }
5901 
5902 template<int scale>
5903 OIIO_FORCEINLINE void
5904 vint16::scatter_mask (const bool_t& mask, value_t *baseptr,
5905                       const vint_t& vindex) const {
5906 #if OIIO_SIMD_AVX >= 512
5907     _mm512_mask_i32scatter_epi32 (baseptr, mask, vindex, m_simd, scale);
5908 #else
5909     lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
5910     hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
5911 #endif
5912 }
5913 
5914 
5915 OIIO_FORCEINLINE void vint16::store (int *values) const {
5916 #if OIIO_SIMD_AVX >= 512
5917     // Use an unaligned store -- it's just as fast when the memory turns
5918     // out to be aligned, nearly as fast even when unaligned. Not worth
5919     // the headache of using stores that require alignment.
5920     _mm512_storeu_si512 ((simd_t *)values, m_simd);
5921 #else
5922     lo().store (values);
5923     hi().store (values+8);
5924 #endif
5925 }
5926 
5927 
5928 OIIO_FORCEINLINE void vint16::clear () {
5929 #if OIIO_SIMD_AVX >= 512
5930     m_simd = _mm512_setzero_si512();
5931 #else
5932     *this = 0;
5933 #endif
5934 }
5935 
5936 
5937 OIIO_FORCEINLINE const vint16 vint16::Zero () {
5938 #if OIIO_SIMD_AVX >= 512
5939     return _mm512_setzero_epi32();
5940 #else
5941     return 0;
5942 #endif
5943 }
5944 
5945 OIIO_FORCEINLINE const vint16 vint16::One () { return vint16(1); }
5946 
5947 OIIO_FORCEINLINE const vint16 vint16::NegOne () { return vint16(-1); }
5948 
5949 
5950 OIIO_FORCEINLINE const vint16 vint16::Iota (int start, int step) {
5951     return vint16 (start+0*step, start+1*step, start+2*step, start+3*step,
5952                   start+4*step, start+5*step, start+6*step, start+7*step,
5953                   start+8*step, start+9*step, start+10*step, start+11*step,
5954                   start+12*step, start+13*step, start+14*step, start+15*step);
5955 }
5956 
5957 
5958 OIIO_FORCEINLINE const vint16 vint16::Giota () {
5959     return vint16 (1<<0, 1<<1, 1<<2, 1<<3,  1<<4, 1<<5, 1<<6, 1<<7,
5960                   1<<8, 1<<9, 1<<10, 1<<11,  1<<12, 1<<13, 1<<14, 1<<15);
5961 }
5962 
5963 
5964 OIIO_FORCEINLINE vint8 vint16::lo () const {
5965 #if OIIO_SIMD_AVX >= 512
5966     return _mm512_castsi512_si256 (simd());
5967 #else
5968     return m_8[0];
5969 #endif
5970 }
5971 
5972 OIIO_FORCEINLINE vint8 vint16::hi () const {
5973 #if OIIO_SIMD_AVX >= 512
5974     return _mm512_extracti64x4_epi64 (simd(), 1);
5975 #else
5976     return m_8[1];
5977 #endif
5978 }
5979 
5980 
5981 OIIO_FORCEINLINE vint16::vint16 (const vint8& lo, const vint8 &hi) {
5982 #if OIIO_SIMD_AVX >= 512
5983     __m512i r = _mm512_castsi256_si512 (lo);
5984     m_simd = _mm512_inserti32x8 (r, hi, 1);
5985 #else
5986     m_8[0] = lo;
5987     m_8[1] = hi;
5988 #endif
5989 }
5990 
5991 
5992 OIIO_FORCEINLINE vint16::vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d) {
5993 #if OIIO_SIMD_AVX >= 512
5994     m_simd = _mm512_broadcast_i32x4(a);
5995     m_simd = _mm512_inserti32x4 (m_simd, b, 1);
5996     m_simd = _mm512_inserti32x4 (m_simd, c, 2);
5997     m_simd = _mm512_inserti32x4 (m_simd, d, 3);
5998 #else
5999     m_8[0] = vint8(a,b);
6000     m_8[1] = vint8(c,d);
6001 #endif
6002 }
6003 
6004 
6005 OIIO_FORCEINLINE vint16 operator+ (const vint16& a, const vint16& b) {
6006 #if OIIO_SIMD_AVX >= 512
6007     return _mm512_add_epi32 (a.simd(), b.simd());
6008 #else
6009     return vint16 (a.lo()+b.lo(), a.hi()+b.hi());
6010 #endif
6011 }
6012 
6013 
6014 OIIO_FORCEINLINE const vint16& operator+= (vint16& a, const vint16& b) {
6015     return a = a + b;
6016 }
6017 
6018 
6019 OIIO_FORCEINLINE vint16 operator- (const vint16& a) {
6020 #if OIIO_SIMD_AVX >= 512
6021     return _mm512_sub_epi32 (_mm512_setzero_si512(), a);
6022 #else
6023     return vint16 (-a.lo(), -a.hi());
6024 #endif
6025 }
6026 
6027 
6028 OIIO_FORCEINLINE vint16 operator- (const vint16& a, const vint16& b) {
6029 #if OIIO_SIMD_AVX >= 512
6030     return _mm512_sub_epi32 (a.simd(), b.simd());
6031 #else
6032     return vint16 (a.lo()-b.lo(), a.hi()-b.hi());
6033 #endif
6034 }
6035 
6036 
6037 OIIO_FORCEINLINE const vint16 &operator-= (vint16& a, const vint16& b) {
6038     return a = a - b;
6039 }
6040 
6041 
6042 OIIO_FORCEINLINE vint16 operator* (const vint16& a, const vint16& b) {
6043 #if OIIO_SIMD_AVX >= 512
6044     return _mm512_mullo_epi32 (a.simd(), b.simd());
6045 #else
6046     return vint16 (a.lo()*b.lo(), a.hi()*b.hi());
6047 #endif
6048 }
6049 
6050 
6051 OIIO_FORCEINLINE const vint16& operator*= (vint16& a, const vint16& b) { return a = a * b; }
6052 OIIO_FORCEINLINE const vint16& operator*= (vint16& a, int b) { return a = a * b; }
6053 
6054 
6055 OIIO_FORCEINLINE vint16 operator/ (const vint16& a, const vint16& b) {
6056     // NO INTEGER DIVISION IN AVX512!
6057     SIMD_RETURN (vint16, a[i] / b[i]);
6058 }
6059 
6060 OIIO_FORCEINLINE const vint16& operator/= (vint16& a, const vint16& b) { return a = a / b; }
6061 
6062 
6063 OIIO_FORCEINLINE vint16 operator% (const vint16& a, const vint16& b) {
6064     // NO INTEGER MODULUS IN AVX512!
6065     SIMD_RETURN (vint16, a[i] % b[i]);
6066 }
6067 
6068 OIIO_FORCEINLINE const vint16& operator%= (vint16& a, const vint16& b) { return a = a % b; }
6069 
6070 OIIO_FORCEINLINE vint16 operator% (const vint16& a, int w) {
6071     // NO INTEGER MODULUS in AVX512!
6072     SIMD_RETURN (vint16, a[i] % w);
6073 }
6074 
6075 OIIO_FORCEINLINE const vint16& operator%= (vint16& a, int b) { return a = a % b; }
6076 
6077 
6078 OIIO_FORCEINLINE vint16 operator& (const vint16& a, const vint16& b) {
6079 #if OIIO_SIMD_AVX >= 512
6080     return _mm512_and_si512 (a.simd(), b.simd());
6081 #else
6082     return vint16 (a.lo() & b.lo(), a.hi() & b.hi());
6083 #endif
6084 }
6085 
6086 OIIO_FORCEINLINE const vint16& operator&= (vint16& a, const vint16& b) { return a = a & b; }
6087 
6088 OIIO_FORCEINLINE vint16 operator| (const vint16& a, const vint16& b) {
6089 #if OIIO_SIMD_AVX >= 512
6090     return _mm512_or_si512 (a.simd(), b.simd());
6091 #else
6092     return vint16 (a.lo() | b.lo(), a.hi() | b.hi());
6093 #endif
6094 }
6095 
6096 OIIO_FORCEINLINE const vint16& operator|= (vint16& a, const vint16& b) { return a = a | b; }
6097 
6098 OIIO_FORCEINLINE vint16 operator^ (const vint16& a, const vint16& b) {
6099 #if OIIO_SIMD_AVX >= 512
6100     return _mm512_xor_si512 (a.simd(), b.simd());
6101 #else
6102     return vint16 (a.lo() ^ b.lo(), a.hi() ^ b.hi());
6103 #endif
6104 }
6105 
6106 OIIO_FORCEINLINE const vint16& operator^= (vint16& a, const vint16& b) { return a = a ^ b; }
6107 
6108 
6109 OIIO_FORCEINLINE vint16 operator~ (const vint16& a) {
6110 #if OIIO_SIMD_AVX >= 512
6111     return a ^ a.NegOne();
6112 #else
6113     return vint16 (~a.lo(), ~a.hi());
6114 #endif
6115 }
6116 
6117 
6118 OIIO_FORCEINLINE vint16 operator<< (const vint16& a, const unsigned int bits) {
6119 #if OIIO_SIMD_AVX >= 512
6120     return _mm512_sllv_epi32 (a, vint16(int(bits)));
6121     // return _mm512_slli_epi32 (a, bits);
6122     // FIXME: can this be slli?
6123 #else
6124     return vint16 (a.lo() << bits, a.hi() << bits);
6125 #endif
6126 }
6127 
6128 
6129 OIIO_FORCEINLINE const vint16& operator<<= (vint16& a, const unsigned int bits) {
6130     return a = a << bits;
6131 }
6132 
6133 OIIO_FORCEINLINE vint16 operator>> (const vint16& a, const unsigned int bits) {
6134 #if OIIO_SIMD_AVX >= 512
6135     return _mm512_srav_epi32 (a, vint16(int(bits)));
6136     // FIXME: can this be srai?
6137 #else
6138     return vint16 (a.lo() >> bits, a.hi() >> bits);
6139 #endif
6140 }
6141 
6142 OIIO_FORCEINLINE const vint16& operator>>= (vint16& a, const unsigned int bits) {
6143     return a = a >> bits;
6144 }
6145 
6146 
6147 OIIO_FORCEINLINE vint16 srl (const vint16& a, const unsigned int bits) {
6148 #if OIIO_SIMD_AVX >= 512
6149     return _mm512_srlv_epi32 (a, vint16(int(bits)));
6150     // FIXME: can this be srli?
6151 #else
6152     return vint16 (srl(a.lo(), bits), srl (a.hi(), bits));
6153 #endif
6154 }
6155 
6156 
6157 OIIO_FORCEINLINE vbool16 operator== (const vint16& a, const vint16& b) {
6158 #if OIIO_SIMD_AVX >= 512
6159     return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 0 /*_MM_CMPINT_EQ*/);
6160 #else  /* Fall back to 8-wide */
6161     return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
6162 #endif
6163 }
6164 
6165 
6166 OIIO_FORCEINLINE vbool16 operator!= (const vint16& a, const vint16& b) {
6167 #if OIIO_SIMD_AVX >= 512
6168     return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 4 /*_MM_CMPINT_NEQ*/);
6169 #else  /* Fall back to 8-wide */
6170     return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
6171 #endif
6172 }
6173 
6174 
6175 OIIO_FORCEINLINE vbool16 operator> (const vint16& a, const vint16& b) {
6176 #if OIIO_SIMD_AVX >= 512
6177     return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 6 /*_MM_CMPINT_NLE*/);
6178 #else  /* Fall back to 8-wide */
6179     return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
6180 #endif
6181 }
6182 
6183 
6184 OIIO_FORCEINLINE vbool16 operator< (const vint16& a, const vint16& b) {
6185 #if OIIO_SIMD_AVX >= 512
6186     return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 1 /*_MM_CMPINT_LT*/);
6187 #else  /* Fall back to 8-wide */
6188     return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
6189 #endif
6190 }
6191 
6192 
6193 OIIO_FORCEINLINE vbool16 operator>= (const vint16& a, const vint16& b) {
6194 #if OIIO_SIMD_AVX >= 512
6195     return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 5 /*_MM_CMPINT_NLT*/);
6196 #else  /* Fall back to 8-wide */
6197     return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
6198 #endif
6199 }
6200 
6201 
6202 OIIO_FORCEINLINE vbool16 operator<= (const vint16& a, const vint16& b) {
6203 #if OIIO_SIMD_AVX >= 512
6204     return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 2 /*_MM_CMPINT_LE*/);
6205 #else  /* Fall back to 8-wide */
6206     return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
6207 #endif
6208 }
6209 
6210 
6211 inline std::ostream& operator<< (std::ostream& cout, const vint16& val) {
6212     cout << val[0];
6213     for (int i = 1; i < val.elements; ++i)
6214         cout << ' ' << val[i];
6215     return cout;
6216 }
6217 
6218 
6219 
6220 OIIO_FORCEINLINE void vint16::store (int *values, int n) const {
6221     OIIO_DASSERT (n >= 0 && n <= elements);
6222 #if 0 && OIIO_SIMD_AVX >= 512
6223     // This SHOULD be fast, but in my benchmarks, it is slower!
6224     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
6225     // Re-test this periodically with new Intel hardware.
6226     _mm512_mask_storeu_epi32 (values, __mmask16(~(0xffff << n)), m_simd);
6227 #else
6228     if (n > 8) {
6229         m_8[0].store (values);
6230         m_8[1].store (values+8, n-8);
6231     } else {
6232         m_8[0].store (values, n);
6233     }
6234 #endif
6235 }
6236 
6237 
6238 OIIO_FORCEINLINE void vint16::store (unsigned short *values) const {
6239 #if OIIO_SIMD_AVX512
6240     _mm512_mask_cvtepi32_storeu_epi16 (values, __mmask16(0xff), m_simd);
6241 #elif OIIO_SIMD_AVX >= 2
6242     lo().store (values);
6243     hi().store (values+8);
6244 #else
6245     SIMD_DO (values[i] = m_val[i]);
6246 #endif
6247 }
6248 
6249 
6250 OIIO_FORCEINLINE void vint16::store (unsigned char *values) const {
6251 #if OIIO_SIMD_AVX512
6252     _mm512_mask_cvtepi32_storeu_epi8 (values, __mmask16(0xff), m_simd);
6253 #elif OIIO_SIMD_AVX >= 2
6254     lo().store (values);
6255     hi().store (values+8);
6256 #else
6257     SIMD_DO (values[i] = m_val[i]);
6258 #endif
6259 }
6260 
6261 
6262 
6263 // Shuffle groups of 4
6264 template<int i0, int i1, int i2, int i3>
6265 vint16 shuffle4 (const vint16& a) {
6266 #if OIIO_SIMD_AVX >= 512
6267     __m512 x = _mm512_castsi512_ps(a);
6268     return _mm512_castps_si512(_mm512_shuffle_f32x4(x,x,_MM_SHUFFLE(i3,i2,i1,i0)));
6269 #else
6270     vint4 x[4];
6271     a.store ((int *)x);
6272     return vint16 (x[i0], x[i1], x[i2], x[i3]);
6273 #endif
6274 }
6275 
6276 template<int i> vint16 shuffle4 (const vint16& a) {
6277     return shuffle4<i,i,i,i> (a);
6278 }
6279 
6280 template<int i0, int i1, int i2, int i3>
6281 vint16 shuffle (const vint16& a) {
6282 #if OIIO_SIMD_AVX >= 512
6283     __m512 x = _mm512_castsi512_ps(a);
6284     return _mm512_castps_si512(_mm512_permute_ps(x,_MM_SHUFFLE(i3,i2,i1,i0)));
6285 #else
6286     vint4 x[4];
6287     a.store ((int *)x);
6288     return vint16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
6289                   shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
6290 #endif
6291 }
6292 
6293 template<int i> vint16 shuffle (const vint16& a) {
6294     return shuffle<i,i,i,i> (a);
6295 }
6296 
6297 
6298 template<int i>
6299 OIIO_FORCEINLINE int extract (const vint16& a) {
6300     return a[i];
6301 }
6302 
6303 
6304 template<int i>
6305 OIIO_FORCEINLINE vint16 insert (const vint16& a, int val) {
6306     vint16 tmp = a;
6307     tmp[i] = val;
6308     return tmp;
6309 }
6310 
6311 
6312 OIIO_FORCEINLINE int vint16::x () const {
6313 #if OIIO_SIMD_AVX >= 512
6314     return _mm_cvtsi128_si32(_mm512_castsi512_si128(m_simd));
6315 #else
6316     return m_val[0];
6317 #endif
6318 }
6319 
6320 OIIO_FORCEINLINE int vint16::y () const { return m_val[1]; }
6321 OIIO_FORCEINLINE int vint16::z () const { return m_val[2]; }
6322 OIIO_FORCEINLINE int vint16::w () const { return m_val[3]; }
6323 OIIO_FORCEINLINE void vint16::set_x (int val) { m_val[0] = val; }
6324 OIIO_FORCEINLINE void vint16::set_y (int val) { m_val[1] = val; }
6325 OIIO_FORCEINLINE void vint16::set_z (int val) { m_val[2] = val; }
6326 OIIO_FORCEINLINE void vint16::set_w (int val) { m_val[3] = val; }
6327 
6328 
6329 OIIO_FORCEINLINE vint16 bitcast_to_int (const vbool16& x)
6330 {
6331 #if OIIO_SIMD_AVX >= 512
6332     return _mm512_maskz_set1_epi32 (x, -1);
6333 #else
6334     return vint16 (bitcast_to_int(x.lo()), bitcast_to_int(x.hi()));
6335 #endif
6336 }
6337 
6338 
6339 OIIO_FORCEINLINE vint16 vreduce_add (const vint16& v) {
6340 #if OIIO_SIMD_AVX >= 512
6341     // Nomenclature: ABCD are the vint4's comprising v
6342     // First, add the vint4's and make them all the same
6343     vint16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v);  // each adjacent vint4 is summed
6344     vint16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);  // ABCD in all quads
6345     // Now, add within each vint4
6346     vint16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w);  // each adjacent int is summed
6347     return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
6348 #else
6349     vint8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
6350     return vint16 (sum, sum);
6351 #endif
6352 }
6353 
6354 
6355 OIIO_FORCEINLINE int reduce_add (const vint16& v) {
6356 #if OIIO_SIMD_AVX >= 512
6357     return vreduce_add(v).x();
6358 #else
6359     return reduce_add(v.lo()) + reduce_add(v.hi());
6360 #endif
6361 }
6362 
6363 
6364 OIIO_FORCEINLINE int reduce_and (const vint16& v) {
6365 #if OIIO_SIMD_AVX >= 512
6366     // Nomenclature: ABCD are the vint4's comprising v
6367     // First, and the vint4's and make them all the same
6368     vint16 AB_AB_CD_CD = v & shuffle4<1,0,3,2>(v);  // each adjacent vint4 is summed
6369     vint16 w = AB_AB_CD_CD & shuffle4<2,3,0,1>(AB_AB_CD_CD);
6370     // Now, and within each vint4
6371     vint16 ab_ab_cd_cd = w & shuffle<1,0,3,2>(w);  // each adjacent int is summed
6372     vint16 r = ab_ab_cd_cd & shuffle<2,3,0,1>(ab_ab_cd_cd);
6373     return r.x();
6374 #else
6375     return reduce_and(v.lo()) & reduce_and(v.hi());
6376 #endif
6377 }
6378 
6379 
6380 OIIO_FORCEINLINE int reduce_or (const vint16& v) {
6381 #if OIIO_SIMD_AVX >= 512
6382     // Nomenclature: ABCD are the vint4's comprising v
6383     // First, or the vint4's or make them all the same
6384     vint16 AB_AB_CD_CD = v | shuffle4<1,0,3,2>(v);  // each adjacent vint4 is summed
6385     vint16 w = AB_AB_CD_CD | shuffle4<2,3,0,1>(AB_AB_CD_CD);
6386     // Now, or within each vint4
6387     vint16 ab_ab_cd_cd = w | shuffle<1,0,3,2>(w);  // each adjacent int is summed
6388     vint16 r = ab_ab_cd_cd | shuffle<2,3,0,1>(ab_ab_cd_cd);
6389     return r.x();
6390 #else
6391     return reduce_or(v.lo()) | reduce_or(v.hi());
6392 #endif
6393 }
6394 
6395 
6396 
6397 OIIO_FORCEINLINE vint16 blend (const vint16& a, const vint16& b, const vbool16& mask) {
6398 #if OIIO_SIMD_AVX >= 512
6399     return _mm512_mask_blend_epi32 (mask, a, b);
6400 #else
6401     return vint16 (blend (a.lo(), b.lo(), mask.lo()),
6402                   blend (a.hi(), b.hi(), mask.hi()));
6403 #endif
6404 }
6405 
6406 
6407 OIIO_FORCEINLINE vint16 blend0 (const vint16& a, const vbool16& mask) {
6408 #if OIIO_SIMD_AVX >= 512
6409     return _mm512_maskz_mov_epi32 (mask, a);
6410 #else
6411     return vint16 (blend0 (a.lo(), mask.lo()),
6412                   blend0 (a.hi(), mask.hi()));
6413 #endif
6414 }
6415 
6416 
6417 OIIO_FORCEINLINE vint16 blend0not (const vint16& a, const vbool16& mask) {
6418 #if OIIO_SIMD_AVX >= 512
6419     return _mm512_maskz_mov_epi32 (!mask, a);
6420 #else
6421     return vint16 (blend0not (a.lo(), mask.lo()),
6422                   blend0not (a.hi(), mask.hi()));
6423 #endif
6424 }
6425 
6426 OIIO_FORCEINLINE vint16 select (const vbool16& mask, const vint16& a, const vint16& b) {
6427     return blend (b, a, mask);
6428 }
6429 
6430 
6431 OIIO_FORCEINLINE vint16 abs (const vint16& a) {
6432 #if OIIO_SIMD_AVX >= 512
6433     return _mm512_abs_epi32(a.simd());
6434 #else
6435     return vint16 (abs(a.lo()), abs(a.hi()));
6436 #endif
6437 }
6438 
6439 
6440 OIIO_FORCEINLINE vint16 min (const vint16& a, const vint16& b) {
6441 #if OIIO_SIMD_AVX >= 512
6442     return _mm512_min_epi32 (a, b);
6443 #else
6444     return vint16 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
6445 #endif
6446 }
6447 
6448 
6449 OIIO_FORCEINLINE vint16 max (const vint16& a, const vint16& b) {
6450 #if OIIO_SIMD_AVX >= 512
6451     return _mm512_max_epi32 (a, b);
6452 #else
6453     return vint16 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
6454 #endif
6455 }
6456 
6457 
6458 OIIO_FORCEINLINE vint16 rotl(const vint16& x, int s) {
6459 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6460     // return _mm512_rol_epi32 (x, s);
6461     // We want to do this ^^^ but this intrinsic only takes an *immediate*
6462     // argument for s, and there isn't a way to express in C++ that a
6463     // parameter must be an immediate/literal value from the caller.
6464     return (x<<s) | srl(x,32-s);
6465 #else
6466     return (x<<s) | srl(x,32-s);
6467 #endif
6468 }
6469 
6470 // DEPRECATED (2.1)
6471 OIIO_FORCEINLINE vint16 rotl32 (const vint16& x, const unsigned int k) {
6472     return rotl(x, k);
6473 }
6474 
6475 
6476 OIIO_FORCEINLINE vint16 andnot (const vint16& a, const vint16& b) {
6477 #if OIIO_SIMD_AVX >= 512
6478     return _mm512_andnot_epi32 (a.simd(), b.simd());
6479 #else
6480     return vint16 (andnot(a.lo(), b.lo()), andnot(a.hi(), b.hi()));
6481 #endif
6482 }
6483 
6484 
6485 
6486 OIIO_FORCEINLINE vint16 safe_mod (const vint16& a, const vint16& b) {
6487     // NO INTEGER MODULUS IN SSE!
6488     SIMD_RETURN (vint16, b[i] ? a[i] % b[i] : 0);
6489 }
6490 
6491 OIIO_FORCEINLINE vint16 safe_mod (const vint16& a, int b) {
6492     return b ? (a % b) : vint16::Zero();
6493 }
6494 
6495 
6496 
6497 
6498 
6499 //////////////////////////////////////////////////////////////////////
6500 // vfloat4 implementation
6501 
6502 
6503 OIIO_FORCEINLINE vfloat4::vfloat4 (const vint4& ival) {
6504 #if OIIO_SIMD_SSE
6505     m_simd = _mm_cvtepi32_ps (ival.simd());
6506 #elif OIIO_SIMD_NEON
6507     m_simd = vcvtq_f32_s32(ival.simd());
6508 #else
6509     SIMD_CONSTRUCT (float(ival[i]));
6510 #endif
6511 }
6512 
6513 
6514 OIIO_FORCEINLINE const vfloat4 vfloat4::Zero () {
6515 #if OIIO_SIMD_SSE
6516     return _mm_setzero_ps();
6517 #else
6518     return vfloat4(0.0f);
6519 #endif
6520 }
6521 
6522 OIIO_FORCEINLINE const vfloat4 vfloat4::One () {
6523     return vfloat4(1.0f);
6524 }
6525 
6526 OIIO_FORCEINLINE const vfloat4 vfloat4::Iota (float start, float step) {
6527     return vfloat4 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step);
6528 }
6529 
6530 /// Set all components to 0.0
6531 OIIO_FORCEINLINE void vfloat4::clear () {
6532 #if OIIO_SIMD_SSE
6533     m_simd = _mm_setzero_ps();
6534 #else
6535     load (0.0f);
6536 #endif
6537 }
6538 
6539 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator= (const Imath::V4f &v) {
6540     load ((const float *)&v);
6541     return *this;
6542 }
6543 
6544 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator= (const Imath::V3f &v) {
6545     load (v[0], v[1], v[2], 0.0f);
6546     return *this;
6547 }
6548 
6549 OIIO_FORCEINLINE float& vfloat4::operator[] (int i) {
6550     OIIO_DASSERT(i<elements);
6551     return m_val[i];
6552 }
6553 
6554 OIIO_FORCEINLINE float vfloat4::operator[] (int i) const {
6555     OIIO_DASSERT(i<elements);
6556     return m_val[i];
6557 }
6558 
6559 
6560 OIIO_FORCEINLINE void vfloat4::load (float val) {
6561 #if OIIO_SIMD_SSE
6562     m_simd = _mm_set1_ps (val);
6563 #elif OIIO_SIMD_NEON
6564     m_simd = vdupq_n_f32 (val);
6565 #else
6566     SIMD_CONSTRUCT (val);
6567 #endif
6568 }
6569 
6570 OIIO_FORCEINLINE void vfloat4::load (float a, float b, float c, float d) {
6571 #if OIIO_SIMD_SSE
6572     m_simd = _mm_set_ps (d, c, b, a);
6573 #elif OIIO_SIMD_NEON
6574     float values[4] = { a, b, c, d };
6575     m_simd = vld1q_f32 (values);
6576 #else
6577     m_val[0] = a;
6578     m_val[1] = b;
6579     m_val[2] = c;
6580     m_val[3] = d;
6581 #endif
6582 }
6583 
6584     /// Load from an array of 4 values
6585 OIIO_FORCEINLINE void vfloat4::load (const float *values) {
6586 #if OIIO_SIMD_SSE
6587     m_simd = _mm_loadu_ps (values);
6588 #elif OIIO_SIMD_NEON
6589     m_simd = vld1q_f32 (values);
6590 #else
6591     SIMD_CONSTRUCT (values[i]);
6592 #endif
6593 }
6594 
6595 
6596 OIIO_FORCEINLINE void vfloat4::load (const float *values, int n) {
6597     OIIO_DASSERT (n >= 0 && n <= elements);
6598 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6599     m_simd = _mm_maskz_loadu_ps (__mmask8(~(0xf << n)), values);
6600 #elif OIIO_SIMD_SSE
6601     switch (n) {
6602     case 1:
6603         m_simd = _mm_load_ss (values);
6604         break;
6605     case 2:
6606         // Trickery: load one double worth of bits!
6607         m_simd = _mm_castpd_ps (_mm_load_sd ((const double*)values));
6608         break;
6609     case 3:
6610         m_simd = _mm_setr_ps (values[0], values[1], values[2], 0.0f);
6611         // This looks wasteful, but benchmarks show that it's the
6612         // fastest way to set 3 values with the 4th getting zero.
6613         // Actually, gcc and clang both turn it into something more
6614         // efficient than _mm_setr_ps. The version below looks smart,
6615         // but was much more expensive as the _mm_setr_ps!
6616         //   __m128 xy = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)values));
6617         //   m_simd = _mm_movelh_ps(xy, _mm_load_ss (values + 2));
6618         break;
6619     case 4:
6620         m_simd = _mm_loadu_ps (values);
6621         break;
6622     default:
6623         clear();
6624         break;
6625     }
6626 #elif OIIO_SIMD_NEON
6627     switch (n) {
6628     case 1: m_simd = vdupq_n_f32(0); m_simd[0] = values[0]; break;
6629     case 2: load (values[0], values[1], 0.0f, 0.0f);      break;
6630     case 3: load (values[0], values[1], values[2], 0.0f); break;
6631     case 4: m_simd = vld1q_f32 (values);                   break;
6632     default: break;
6633     }
6634 #else
6635     for (int i = 0; i < n; ++i)
6636         m_val[i] = values[i];
6637     for (int i = n; i < paddedelements; ++i)
6638         m_val[i] = 0;
6639 #endif
6640 }
6641 
6642 
6643 OIIO_FORCEINLINE void vfloat4::load (const unsigned short *values) {
6644 #if OIIO_SIMD_SSE >= 2
6645     m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6646     // You might guess that the following is faster, but it's NOT:
6647     //   NO!  m_simd = _mm_cvtpu16_ps (*(__m64*)values);
6648 #else
6649     SIMD_CONSTRUCT (values[i]);
6650 #endif
6651 }
6652 
6653 
6654 OIIO_FORCEINLINE void vfloat4::load (const short *values) {
6655 #if OIIO_SIMD_SSE >= 2
6656     m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6657 #else
6658     SIMD_CONSTRUCT (values[i]);
6659 #endif
6660 }
6661 
6662 
6663 OIIO_FORCEINLINE void vfloat4::load (const unsigned char *values) {
6664 #if OIIO_SIMD_SSE >= 2
6665     m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6666 #else
6667     SIMD_CONSTRUCT (values[i]);
6668 #endif
6669 }
6670 
6671 // Load from an array of 4 char values, convert to float
6672 OIIO_FORCEINLINE void vfloat4::load (const char *values) {
6673 #if OIIO_SIMD_SSE >= 2
6674     m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6675 #else
6676     SIMD_CONSTRUCT (values[i]);
6677 #endif
6678 }
6679 
6680 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6681 OIIO_FORCEINLINE void vfloat4::load (const half *values) {
6682 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6683     /* Enabled 16 bit float instructions! */
6684     __m128i a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
6685     m_simd = _mm_cvtph_ps (a);
6686 #elif OIIO_SIMD_SSE >= 2
6687     // SSE half-to-float by Fabian "ryg" Giesen. Public domain.
6688     // https://gist.github.com/rygorous/2144712
6689     vint4 h ((const unsigned short *)values);
6690 # define CONSTI(name) *(const __m128i *)&name
6691 # define CONSTF(name) *(const __m128 *)&name
6692     OIIO_SIMD_UINT4_CONST(mask_nosign, 0x7fff);
6693     OIIO_SIMD_UINT4_CONST(magic,       (254 - 15) << 23);
6694     OIIO_SIMD_UINT4_CONST(was_infnan,  0x7bff);
6695     OIIO_SIMD_UINT4_CONST(exp_infnan,  255 << 23);
6696     __m128i mnosign     = CONSTI(mask_nosign);
6697     __m128i expmant     = _mm_and_si128(mnosign, h);
6698     __m128i justsign    = _mm_xor_si128(h, expmant);
6699     __m128i expmant2    = expmant; // copy (just here for counting purposes)
6700     __m128i shifted     = _mm_slli_epi32(expmant, 13);
6701     __m128  scaled      = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
6702     __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONSTI(was_infnan));
6703     __m128i sign        = _mm_slli_epi32(justsign, 16);
6704     __m128  infnanexp   = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
6705     __m128  sign_inf    = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
6706     __m128  final       = _mm_or_ps(scaled, sign_inf);
6707     // ~11 SSE2 ops.
6708     m_simd = final;
6709 # undef CONSTI
6710 # undef CONSTF
6711 #else /* No SIMD defined: */
6712     SIMD_CONSTRUCT (values[i]);
6713 #endif
6714 }
6715 #endif /* _HALF_H_ or _IMATH_H_ */
6716 
6717 OIIO_FORCEINLINE void vfloat4::store (float *values) const {
6718 #if OIIO_SIMD_SSE
6719     // Use an unaligned store -- it's just as fast when the memory turns
6720     // out to be aligned, nearly as fast even when unaligned. Not worth
6721     // the headache of using stores that require alignment.
6722     _mm_storeu_ps (values, m_simd);
6723 #elif OIIO_SIMD_NEON
6724     vst1q_f32 (values, m_simd);
6725 #else
6726     SIMD_DO (values[i] = m_val[i]);
6727 #endif
6728 }
6729 
6730 OIIO_FORCEINLINE void vfloat4::store (float *values, int n) const {
6731     OIIO_DASSERT (n >= 0 && n <= 4);
6732 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6733     // This SHOULD be fast, but in my benchmarks, it is slower!
6734     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
6735     // Re-test this periodically with new Intel hardware.
6736     _mm_mask_storeu_ps (values, __mmask8(~(0xf << n)), m_simd);
6737 #elif OIIO_SIMD_SSE
6738     switch (n) {
6739         case 1:
6740         _mm_store_ss (values, m_simd);
6741         break;
6742     case 2:
6743         // Trickery: store two floats as a double worth of bits
6744         _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
6745         break;
6746     case 3:
6747         values[0] = m_val[0];
6748         values[1] = m_val[1];
6749         values[2] = m_val[2];
6750         // This looks wasteful, but benchmarks show that it's the
6751         // fastest way to store 3 values, in benchmarks was faster than
6752         // this, below:
6753         //   _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
6754         //   _mm_store_ss (values + 2, _mm_movehl_ps(m_simd,m_simd));
6755         break;
6756     case 4:
6757         store (values);
6758         break;
6759     default:
6760         break;
6761     }
6762 #elif OIIO_SIMD_NEON
6763     switch (n) {
6764     case 1:
6765         vst1q_lane_f32 (values, m_simd, 0);
6766         break;
6767     case 2:
6768         vst1q_lane_f32 (values++, m_simd, 0);
6769         vst1q_lane_f32 (values, m_simd, 1);
6770         break;
6771     case 3:
6772         vst1q_lane_f32 (values++, m_simd, 0);
6773         vst1q_lane_f32 (values++, m_simd, 1);
6774         vst1q_lane_f32 (values, m_simd, 2);
6775         break;
6776     case 4:
6777         vst1q_f32 (values, m_simd); break;
6778     default:
6779         break;
6780     }
6781 #else
6782     for (int i = 0; i < n; ++i)
6783         values[i] = m_val[i];
6784 #endif
6785 }
6786 
6787 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6788 OIIO_FORCEINLINE void vfloat4::store (half *values) const {
6789 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6790     __m128i h = _mm_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
6791     _mm_store_sd ((double *)values, _mm_castsi128_pd(h));
6792 #else
6793     SIMD_DO (values[i] = m_val[i]);
6794 #endif
6795 }
6796 #endif
6797 
6798 
6799 OIIO_FORCEINLINE void vfloat4::load_mask (int mask, const float *values) {
6800 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6801     m_simd = _mm_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
6802 #elif OIIO_SIMD_AVX
6803     m_simd = _mm_maskload_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
6804 #else
6805     SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
6806 #endif
6807 }
6808 
6809 
6810 OIIO_FORCEINLINE void vfloat4::load_mask (const vbool_t& mask, const float *values) {
6811 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6812     m_simd = _mm_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
6813 #elif OIIO_SIMD_AVX
6814     m_simd = _mm_maskload_ps (values, _mm_castps_si128(mask));
6815 #else
6816     SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
6817 #endif
6818 }
6819 
6820 
6821 OIIO_FORCEINLINE void vfloat4::store_mask (int mask, float *values) const {
6822 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6823     _mm_mask_storeu_ps (values, __mmask8(mask), m_simd);
6824 #elif OIIO_SIMD_AVX
6825     _mm_maskstore_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
6826 #else
6827     SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
6828 #endif
6829 }
6830 
6831 
6832 OIIO_FORCEINLINE void vfloat4::store_mask (const vbool_t& mask, float *values) const {
6833 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6834     _mm_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
6835 #elif OIIO_SIMD_AVX
6836     _mm_maskstore_ps (values, _mm_castps_si128(mask.simd()), m_simd);
6837 #else
6838     SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
6839 #endif
6840 }
6841 
6842 
6843 template <int scale>
6844 OIIO_FORCEINLINE void
6845 vfloat4::gather (const value_t *baseptr, const vint_t& vindex)
6846 {
6847 #if OIIO_SIMD_AVX >= 2
6848     m_simd = _mm_i32gather_ps (baseptr, vindex, scale);
6849 #else
6850     SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
6851 #endif
6852 }
6853 
6854 template<int scale>
6855 OIIO_FORCEINLINE void
6856 vfloat4::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
6857 {
6858 #if OIIO_SIMD_AVX >= 2
6859     m_simd = _mm_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
6860 #else
6861     SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
6862 #endif
6863 }
6864 
6865 template<int scale>
6866 OIIO_FORCEINLINE void
6867 vfloat4::scatter (value_t *baseptr, const vint_t& vindex) const
6868 {
6869 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6870     // FIXME: disable because it benchmarks slower than the dumb way
6871     _mm_i32scatter_ps (baseptr, vindex, m_simd, scale);
6872 #else
6873     SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
6874 #endif
6875 }
6876 
6877 template<int scale>
6878 OIIO_FORCEINLINE void
6879 vfloat4::scatter_mask (const bool_t& mask, value_t *baseptr,
6880                        const vint_t& vindex) const
6881 {
6882 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6883     // FIXME: disable because it benchmarks slower than the dumb way
6884     _mm_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
6885 #else
6886     SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
6887 #endif
6888 }
6889 
6890 
6891 OIIO_FORCEINLINE vfloat4 operator+ (const vfloat4& a, const vfloat4& b) {
6892 #if OIIO_SIMD_SSE
6893     return _mm_add_ps (a.m_simd, b.m_simd);
6894 #elif OIIO_SIMD_NEON
6895     return vaddq_f32 (a.m_simd, b.m_simd);
6896 #else
6897     SIMD_RETURN (vfloat4, a[i] + b[i]);
6898 #endif
6899 }
6900 
6901 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator+= (const vfloat4& a) {
6902 #if OIIO_SIMD_SSE
6903     m_simd = _mm_add_ps (m_simd, a.m_simd);
6904 #elif OIIO_SIMD_NEON
6905     m_simd = vaddq_f32 (m_simd, a.m_simd);
6906 #else
6907     SIMD_DO (m_val[i] += a[i]);
6908 #endif
6909     return *this;
6910     }
6911 
6912 OIIO_FORCEINLINE vfloat4 vfloat4::operator- () const {
6913 #if OIIO_SIMD_SSE
6914     return _mm_sub_ps (_mm_setzero_ps(), m_simd);
6915 #elif OIIO_SIMD_NEON
6916     return vsubq_f32 (Zero(), m_simd);
6917 #else
6918     SIMD_RETURN (vfloat4, -m_val[i]);
6919 #endif
6920 }
6921 
6922 OIIO_FORCEINLINE vfloat4 operator- (const vfloat4& a, const vfloat4& b) {
6923 #if OIIO_SIMD_SSE
6924     return _mm_sub_ps (a.m_simd, b.m_simd);
6925 #elif OIIO_SIMD_NEON
6926     return vsubq_f32 (a.m_simd, b.m_simd);
6927 #else
6928     SIMD_RETURN (vfloat4, a[i] - b[i]);
6929 #endif
6930 }
6931 
6932 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator-= (const vfloat4& a) {
6933 #if OIIO_SIMD_SSE
6934     m_simd = _mm_sub_ps (m_simd, a.m_simd);
6935 #elif OIIO_SIMD_NEON
6936     m_simd = vsubq_f32 (m_simd, a.m_simd);
6937 #else
6938     SIMD_DO (m_val[i] -= a[i]);
6939 #endif
6940     return *this;
6941 }
6942 
6943 OIIO_FORCEINLINE vfloat4 operator* (const vfloat4& a, float b) {
6944 #if OIIO_SIMD_SSE
6945     return _mm_mul_ps (a.m_simd, _mm_set1_ps(b));
6946 #elif OIIO_SIMD_NEON
6947     return vmulq_n_f32 (a.m_simd, b);
6948 #else
6949     SIMD_RETURN (vfloat4, a[i] * b);
6950 #endif
6951 }
6952 
6953 OIIO_FORCEINLINE vfloat4 operator* (float a, const vfloat4& b) {
6954     return b * a;
6955 }
6956 
6957 OIIO_FORCEINLINE vfloat4 operator* (const vfloat4& a, const vfloat4& b) {
6958 #if OIIO_SIMD_SSE
6959     return _mm_mul_ps (a.m_simd, b.m_simd);
6960 #elif OIIO_SIMD_NEON
6961     return vmulq_f32 (a.m_simd, b.m_simd);
6962 #else
6963     SIMD_RETURN (vfloat4, a[i] * b[i]);
6964 #endif
6965 }
6966 
6967 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator*= (const vfloat4& a) {
6968 #if OIIO_SIMD_SSE
6969     m_simd = _mm_mul_ps (m_simd, a.m_simd);
6970 #elif OIIO_SIMD_NEON
6971     m_simd = vmulq_f32 (m_simd, a.m_simd);
6972 #else
6973     SIMD_DO (m_val[i] *= a[i]);
6974 #endif
6975     return *this;
6976 }
6977 
6978 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator*= (float val) {
6979 #if OIIO_SIMD_SSE
6980     m_simd = _mm_mul_ps (m_simd, _mm_set1_ps(val));
6981 #elif OIIO_SIMD_NEON
6982     m_simd = vmulq_n_f32 (m_simd, val);
6983 #else
6984     SIMD_DO (m_val[i] *= val);
6985 #endif
6986     return *this;
6987 }
6988 
6989 OIIO_FORCEINLINE vfloat4 operator/ (const vfloat4& a, const vfloat4& b) {
6990 #if OIIO_SIMD_SSE
6991     return _mm_div_ps (a.m_simd, b.m_simd);
6992 #elif OIIO_SIMD_NEON
6993     return vdivq_f32 (a.m_simd, b.m_simd);
6994 #else
6995     SIMD_RETURN (vfloat4, a[i] / b[i]);
6996 #endif
6997 }
6998 
6999 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator/= (const vfloat4& a) {
7000 #if OIIO_SIMD_SSE
7001     m_simd = _mm_div_ps (m_simd, a.m_simd);
7002 #elif OIIO_SIMD_NEON
7003     m_simd = vdivq_f32 (m_simd, a.m_simd);
7004 #else
7005     SIMD_DO (m_val[i] /= a[i]);
7006 #endif
7007     return *this;
7008 }
7009 
7010 OIIO_FORCEINLINE const vfloat4 & vfloat4::operator/= (float val) {
7011 #if OIIO_SIMD_SSE
7012     m_simd = _mm_div_ps (m_simd, _mm_set1_ps(val));
7013 #elif OIIO_SIMD_NEON
7014     m_simd = vdivq_f32 (m_simd, vfloat4(val));
7015 #else
7016     SIMD_DO (m_val[i] /= val);
7017 #endif
7018     return *this;
7019 }
7020 
7021 OIIO_FORCEINLINE vbool4 operator== (const vfloat4& a, const vfloat4& b) {
7022 #if OIIO_SIMD_SSE
7023     return _mm_cmpeq_ps (a.m_simd, b.m_simd);
7024 #elif OIIO_SIMD_NEON
7025     return vceqq_f32 (a.m_simd, b.m_simd);
7026 #else
7027     SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
7028 #endif
7029 }
7030 
7031 OIIO_FORCEINLINE vbool4 operator!= (const vfloat4& a, const vfloat4& b) {
7032 #if OIIO_SIMD_SSE
7033     return _mm_cmpneq_ps (a.m_simd, b.m_simd);
7034 #elif OIIO_SIMD_NEON
7035     // implemented as NOT(a == b)
7036     return vmvnq_u32(vceqq_f32 (a.m_simd, b.m_simd));
7037 #else
7038     SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
7039 #endif
7040 }
7041 
7042 OIIO_FORCEINLINE vbool4 operator< (const vfloat4& a, const vfloat4& b) {
7043 #if OIIO_SIMD_SSE
7044     return _mm_cmplt_ps (a.m_simd, b.m_simd);
7045 #elif OIIO_SIMD_NEON
7046     return vcltq_f32 (a.m_simd, b.m_simd);
7047 #else
7048     SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
7049 #endif
7050 }
7051 
7052 OIIO_FORCEINLINE vbool4 operator> (const vfloat4& a, const vfloat4& b) {
7053 #if OIIO_SIMD_SSE
7054     return _mm_cmpgt_ps (a.m_simd, b.m_simd);
7055 #elif OIIO_SIMD_NEON
7056     return vcgtq_f32 (a.m_simd, b.m_simd);
7057 #else
7058     SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
7059 #endif
7060 }
7061 
7062 OIIO_FORCEINLINE vbool4 operator>= (const vfloat4& a, const vfloat4& b) {
7063 #if OIIO_SIMD_SSE
7064     return _mm_cmpge_ps (a.m_simd, b.m_simd);
7065 #elif OIIO_SIMD_NEON
7066     return vcgeq_f32 (a.m_simd, b.m_simd);
7067 #else
7068     SIMD_RETURN (vbool4, a[i] >= b[i] ? -1 : 0);
7069 #endif
7070 }
7071 
7072 OIIO_FORCEINLINE vbool4 operator<= (const vfloat4& a, const vfloat4& b) {
7073 #if OIIO_SIMD_SSE
7074     return _mm_cmple_ps (a.m_simd, b.m_simd);
7075 #elif OIIO_SIMD_NEON
7076     return vcleq_f32 (a.m_simd, b.m_simd);
7077 #else
7078     SIMD_RETURN (vbool4, a[i] <= b[i] ? -1 : 0);
7079 #endif
7080 }
7081 
7082 OIIO_FORCEINLINE vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b) {
7083 #if OIIO_SIMD_SSE
7084     return _mm_movelh_ps (a.m_simd, b.m_simd);
7085 #else
7086     return vfloat4 (a[0], a[1], b[0], b[1]);
7087 #endif
7088 }
7089 
7090 OIIO_FORCEINLINE vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b) {
7091 #if OIIO_SIMD_SSE
7092     return _mm_unpacklo_ps (a.m_simd, b.m_simd);
7093 #else
7094     return vfloat4 (a[0], b[0], a[1], b[1]);
7095 #endif
7096 }
7097 
7098 OIIO_FORCEINLINE vfloat4 vfloat4::xyz0 () const {
7099     return insert<3>(*this, 0.0f);
7100 }
7101 
7102 OIIO_FORCEINLINE vfloat4 vfloat4::xyz1 () const {
7103     return insert<3>(*this, 1.0f);
7104 }
7105 
7106 inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val) {
7107     cout << val[0];
7108     for (int i = 1; i < val.elements; ++i)
7109         cout << ' ' << val[i];
7110     return cout;
7111 }
7112 
7113 
7114 // Implementation had to be after the definition of vfloat4.
7115 OIIO_FORCEINLINE vint4::vint4 (const vfloat4& f)
7116 {
7117 #if OIIO_SIMD_SSE
7118     m_simd = _mm_cvttps_epi32(f.simd());
7119 #else
7120     SIMD_CONSTRUCT ((int) f[i]);
7121 #endif
7122 }
7123 
7124 
7125 template<int i0, int i1, int i2, int i3>
7126 OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) {
7127 #if OIIO_SIMD_SSE
7128     return shuffle_sse<i0,i1,i2,i3> (__m128(a));
7129 #else
7130     return vfloat4(a[i0], a[i1], a[i2], a[i3]);
7131 #endif
7132 }
7133 
7134 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); }
7135 
7136 #if OIIO_SIMD_NEON
7137 template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) {
7138     float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0);
7139 }
7140 template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) {
7141     float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1);
7142 }
7143 template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) {
7144     float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0);
7145 }
7146 template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
7147     float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1);
7148 }
7149 #endif
7150 
7151 
7152 
7153 /// Helper: as rapid as possible extraction of one component, when the
7154 /// index is fixed.
7155 template<int i>
7156 OIIO_FORCEINLINE float extract (const vfloat4& a) {
7157 #if OIIO_SIMD_SSE
7158     return _mm_cvtss_f32(shuffle_sse<i,i,i,i>(a.simd()));
7159 #else
7160     return a[i];
7161 #endif
7162 }
7163 
7164 #if OIIO_SIMD_SSE
7165 template<> OIIO_FORCEINLINE float extract<0> (const vfloat4& a) {
7166     return _mm_cvtss_f32(a.simd());
7167 }
7168 #endif
7169 
7170 
7171 /// Helper: substitute val for a[i]
7172 template<int i>
7173 OIIO_FORCEINLINE vfloat4 insert (const vfloat4& a, float val) {
7174 #if OIIO_SIMD_SSE >= 4
7175     return _mm_insert_ps (a, _mm_set_ss(val), i<<4);
7176 #else
7177     vfloat4 tmp = a;
7178     tmp[i] = val;
7179     return tmp;
7180 #endif
7181 }
7182 
7183 #if OIIO_SIMD_SSE
7184 // Slightly faster special cases for SSE
7185 template<> OIIO_FORCEINLINE vfloat4 insert<0> (const vfloat4& a, float val) {
7186     return _mm_move_ss (a.simd(), _mm_set_ss(val));
7187 }
7188 #endif
7189 
7190 
7191 OIIO_FORCEINLINE float vfloat4::x () const { return extract<0>(*this); }
7192 OIIO_FORCEINLINE float vfloat4::y () const { return extract<1>(*this); }
7193 OIIO_FORCEINLINE float vfloat4::z () const { return extract<2>(*this); }
7194 OIIO_FORCEINLINE float vfloat4::w () const { return extract<3>(*this); }
7195 OIIO_FORCEINLINE void vfloat4::set_x (float val) { *this = insert<0>(*this, val); }
7196 OIIO_FORCEINLINE void vfloat4::set_y (float val) { *this = insert<1>(*this, val); }
7197 OIIO_FORCEINLINE void vfloat4::set_z (float val) { *this = insert<2>(*this, val); }
7198 OIIO_FORCEINLINE void vfloat4::set_w (float val) { *this = insert<3>(*this, val); }
7199 
7200 
7201 OIIO_FORCEINLINE vint4 bitcast_to_int (const vfloat4& x)
7202 {
7203 #if OIIO_SIMD_SSE
7204     return _mm_castps_si128 (x.simd());
7205 #else
7206     return *(vint4 *)&x;
7207 #endif
7208 }
7209 
7210 OIIO_FORCEINLINE vfloat4 bitcast_to_float (const vint4& x)
7211 {
7212 #if OIIO_SIMD_SSE
7213     return _mm_castsi128_ps (x.simd());
7214 #else
7215     return *(vfloat4 *)&x;
7216 #endif
7217 }
7218 
7219 
7220 // Old names:
7221 inline vint4 bitcast_to_int4 (const vfloat4& x) { return bitcast_to_int(x); }
7222 inline vfloat4 bitcast_to_float4 (const vint4& x) { return bitcast_to_float(x); }
7223 
7224 
7225 
7226 OIIO_FORCEINLINE vfloat4 vreduce_add (const vfloat4& v) {
7227 #if OIIO_SIMD_SSE >= 3
7228     // People seem to agree that SSE3 does add reduction best with 2
7229     // horizontal adds.
7230     // suppose v = (a, b, c, d)
7231     simd::vfloat4 ab_cd = _mm_hadd_ps (v.simd(), v.simd());
7232     // ab_cd = (a+b, c+d, a+b, c+d)
7233     simd::vfloat4 abcd = _mm_hadd_ps (ab_cd.simd(), ab_cd.simd());
7234     // all abcd elements are a+b+c+d
7235     return abcd;
7236 #elif OIIO_SIMD_SSE
7237     // I think this is the best we can do for SSE2, and I'm still not sure
7238     // it's faster than the default scalar operation. But anyway...
7239     // suppose v = (a, b, c, d)
7240     vfloat4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
7241     // now x = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
7242     vfloat4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
7243     // now y = (c+d,c+d,a+b,a+b)
7244     vfloat4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;   // a+b+c+d in all components
7245     return abcd;
7246 #else
7247     return vfloat4 (v[0] + v[1] + v[2] + v[3]);
7248 #endif
7249 }
7250 
7251 
7252 OIIO_FORCEINLINE float reduce_add (const vfloat4& v) {
7253 #if OIIO_SIMD_SSE
7254     return _mm_cvtss_f32(vreduce_add (v));
7255 #elif OIIO_SIMD_NEON
7256     return vaddvq_f32(v);
7257 #else
7258     return v[0] + v[1] + v[2] + v[3];
7259 #endif
7260 }
7261 
7262 OIIO_FORCEINLINE vfloat4 vdot (const vfloat4 &a, const vfloat4 &b) {
7263 #if OIIO_SIMD_SSE >= 4
7264     return _mm_dp_ps (a.simd(), b.simd(), 0xff);
7265 #elif OIIO_SIMD_NEON
7266     float32x4_t ab = vmulq_f32(a, b);
7267     float32x4_t sum1 = vaddq_f32(ab, vrev64q_f32(ab));
7268     return vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
7269 #else
7270     return vreduce_add (a*b);
7271 #endif
7272 }
7273 
7274 OIIO_FORCEINLINE float dot (const vfloat4 &a, const vfloat4 &b) {
7275 #if OIIO_SIMD_SSE >= 4
7276     return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0xff));
7277 #else
7278     return reduce_add (a*b);
7279 #endif
7280 }
7281 
7282 OIIO_FORCEINLINE vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b) {
7283 #if OIIO_SIMD_SSE >= 4
7284     return _mm_dp_ps (a.simd(), b.simd(), 0x7f);
7285 #else
7286     return vreduce_add((a*b).xyz0());
7287 #endif
7288 }
7289 
7290 OIIO_FORCEINLINE float dot3 (const vfloat4 &a, const vfloat4 &b) {
7291 #if OIIO_SIMD_SSE >= 4
7292     return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
7293 #else
7294     return reduce_add ((a*b).xyz0());
7295 #endif
7296 }
7297 
7298 
7299 OIIO_FORCEINLINE vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask)
7300 {
7301 #if OIIO_SIMD_SSE >= 4
7302     // SSE >= 4.1 only
7303     return _mm_blendv_ps (a.simd(), b.simd(), mask.simd());
7304 #elif OIIO_SIMD_SSE
7305     // Trick for SSE < 4.1
7306     return _mm_or_ps (_mm_and_ps(mask.simd(), b.simd()),
7307                       _mm_andnot_ps(mask.simd(), a.simd()));
7308 #elif OIIO_SIMD_NEON
7309     return vbslq_f32 (mask.simd(), b.simd(), a.simd());
7310 #else
7311     return vfloat4 (mask[0] ? b[0] : a[0],
7312                    mask[1] ? b[1] : a[1],
7313                    mask[2] ? b[2] : a[2],
7314                    mask[3] ? b[3] : a[3]);
7315 #endif
7316 }
7317 
7318 
7319 OIIO_FORCEINLINE vfloat4 blend0 (const vfloat4& a, const vbool4& mask)
7320 {
7321 #if OIIO_SIMD_SSE
7322     return _mm_and_ps(mask.simd(), a.simd());
7323 #else
7324     return vfloat4 (mask[0] ? a[0] : 0.0f,
7325                    mask[1] ? a[1] : 0.0f,
7326                    mask[2] ? a[2] : 0.0f,
7327                    mask[3] ? a[3] : 0.0f);
7328 #endif
7329 }
7330 
7331 
7332 OIIO_FORCEINLINE vfloat4 blend0not (const vfloat4& a, const vbool4& mask)
7333 {
7334 #if OIIO_SIMD_SSE
7335     return _mm_andnot_ps(mask.simd(), a.simd());
7336 #else
7337     return vfloat4 (mask[0] ? 0.0f : a[0],
7338                    mask[1] ? 0.0f : a[1],
7339                    mask[2] ? 0.0f : a[2],
7340                    mask[3] ? 0.0f : a[3]);
7341 #endif
7342 }
7343 
7344 
7345 OIIO_FORCEINLINE vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b) {
7346 #if OIIO_SIMD_SSE
7347     return blend0not (a/b, b == vfloat4::Zero());
7348 #else
7349     return vfloat4 (b[0] == 0.0f ? 0.0f : a[0] / b[0],
7350                    b[1] == 0.0f ? 0.0f : a[1] / b[1],
7351                    b[2] == 0.0f ? 0.0f : a[2] / b[2],
7352                    b[3] == 0.0f ? 0.0f : a[3] / b[3]);
7353 #endif
7354 }
7355 
7356 
7357 OIIO_FORCEINLINE vfloat3 hdiv (const vfloat4 &a)
7358 {
7359 #if OIIO_SIMD_SSE
7360     return vfloat3(safe_div(a, shuffle<3>(a)).xyz0());
7361 #else
7362     float d = a[3];
7363     return d == 0.0f ? vfloat3 (0.0f) : vfloat3 (a[0]/d, a[1]/d, a[2]/d);
7364 #endif
7365 }
7366 
7367 
7368 
7369 OIIO_FORCEINLINE vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b)
7370 {
7371     return blend (b, a, mask);
7372 }
7373 
7374 
7375 OIIO_FORCEINLINE vfloat4 abs (const vfloat4& a)
7376 {
7377 #if OIIO_SIMD_SSE
7378     // Just clear the sign bit for cheap fabsf
7379     return _mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
7380 #elif OIIO_SIMD_NEON
7381     return vabsq_f32(a.simd());
7382 #else
7383     SIMD_RETURN (vfloat4, fabsf(a[i]));
7384 #endif
7385 }
7386 
7387 
7388 OIIO_FORCEINLINE vfloat4 sign (const vfloat4& a)
7389 {
7390     vfloat4 one(1.0f);
7391     return blend (one, -one, a < vfloat4::Zero());
7392 }
7393 
7394 
7395 OIIO_FORCEINLINE vfloat4 ceil (const vfloat4& a)
7396 {
7397 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
7398     return _mm_ceil_ps (a);
7399 #else
7400     SIMD_RETURN (vfloat4, ceilf(a[i]));
7401 #endif
7402 }
7403 
7404 OIIO_FORCEINLINE vfloat4 floor (const vfloat4& a)
7405 {
7406 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
7407     return _mm_floor_ps (a);
7408 #else
7409     SIMD_RETURN (vfloat4, floorf(a[i]));
7410 #endif
7411 }
7412 
7413 OIIO_FORCEINLINE vfloat4 round (const vfloat4& a)
7414 {
7415 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
7416     return _mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
7417 #else
7418     SIMD_RETURN (vfloat4, roundf(a[i]));
7419 #endif
7420 }
7421 
7422 OIIO_FORCEINLINE vint4 ifloor (const vfloat4& a)
7423 {
7424     // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
7425 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
7426     return vint4(floor(a));
7427 #else
7428     SIMD_RETURN (vint4, (int)floorf(a[i]));
7429 #endif
7430 }
7431 
7432 
7433 OIIO_FORCEINLINE vint4 rint (const vfloat4& a)
7434 {
7435     return vint4 (round(a));
7436 }
7437 
7438 
7439 OIIO_FORCEINLINE vfloat4 rcp_fast (const vfloat4 &a)
7440 {
7441 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
7442     // avx512vl directly has rcp14 on float4
7443     vfloat4 r = _mm_rcp14_ps(a);
7444     return r * nmadd(r,a,vfloat4(2.0f));
7445 #elif OIIO_SIMD_AVX512
7446     // Trickery: in and out of the 512 bit registers to use fast approx rcp
7447     vfloat16 r = _mm512_rcp14_ps(_mm512_castps128_ps512(a));
7448     return _mm512_castps512_ps128(r);
7449 #elif OIIO_SIMD_SSE
7450     vfloat4 r = _mm_rcp_ps(a);
7451     return r * nmadd(r,a,vfloat4(2.0f));
7452 #else
7453     SIMD_RETURN (vfloat4, 1.0f/a[i]);
7454 #endif
7455 }
7456 
7457 
7458 OIIO_FORCEINLINE vfloat4 sqrt (const vfloat4 &a)
7459 {
7460 #if OIIO_SIMD_SSE
7461     return _mm_sqrt_ps (a.simd());
7462 #else
7463     SIMD_RETURN (vfloat4, sqrtf(a[i]));
7464 #endif
7465 }
7466 
7467 
7468 OIIO_FORCEINLINE vfloat4 rsqrt (const vfloat4 &a)
7469 {
7470 #if OIIO_SIMD_SSE
7471     return _mm_div_ps (_mm_set1_ps(1.0f), _mm_sqrt_ps (a.simd()));
7472 #else
7473     SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
7474 #endif
7475 }
7476 
7477 
7478 OIIO_FORCEINLINE vfloat4 rsqrt_fast (const vfloat4 &a)
7479 {
7480 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
7481     // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
7482     return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
7483 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7484     // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
7485     return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
7486 #elif OIIO_SIMD_SSE
7487     return _mm_rsqrt_ps (a.simd());
7488 #else
7489     SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
7490 #endif
7491 }
7492 
7493 
7494 OIIO_FORCEINLINE vfloat4 min (const vfloat4& a, const vfloat4& b)
7495 {
7496 #if OIIO_SIMD_SSE
7497     return _mm_min_ps (a, b);
7498 #elif OIIO_SIMD_NEON
7499     return vminq_f32(a, b);
7500 #else
7501     SIMD_RETURN (vfloat4, std::min (a[i], b[i]));
7502 #endif
7503 }
7504 
7505 OIIO_FORCEINLINE vfloat4 max (const vfloat4& a, const vfloat4& b)
7506 {
7507 #if OIIO_SIMD_SSE
7508     return _mm_max_ps (a, b);
7509 #elif OIIO_SIMD_NEON
7510     return vmaxq_f32(a, b);
7511 #else
7512     SIMD_RETURN (vfloat4, std::max (a[i], b[i]));
7513 #endif
7514 }
7515 
7516 
7517 OIIO_FORCEINLINE vfloat4 andnot (const vfloat4& a, const vfloat4& b) {
7518 #if OIIO_SIMD_SSE
7519     return _mm_andnot_ps (a.simd(), b.simd());
7520 #else
7521     const int *ai = (const int *)&a;
7522     const int *bi = (const int *)&b;
7523     return bitcast_to_float (vint4(~(ai[0]) & bi[0],
7524                                   ~(ai[1]) & bi[1],
7525                                   ~(ai[2]) & bi[2],
7526                                   ~(ai[3]) & bi[3]));
7527 #endif
7528 }
7529 
7530 
7531 OIIO_FORCEINLINE vfloat4 madd (const simd::vfloat4& a, const simd::vfloat4& b,
7532                               const simd::vfloat4& c)
7533 {
7534 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7535     // If we are sure _mm_fmadd_ps intrinsic is available, use it.
7536     return _mm_fmadd_ps (a, b, c);
7537 #elif OIIO_SIMD_NEON
7538     return vmlaq_f32(c.simd(), a.simd(), b.simd());
7539 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7540     // If we directly access the underlying __m128, on some platforms and
7541     // compiler flags, it will turn into fma anyway, even if we don't use
7542     // the intrinsic.
7543     return a.simd() * b.simd() + c.simd();
7544 #else
7545     // Fallback: just use regular math and hope for the best.
7546     return a * b + c;
7547 #endif
7548 }
7549 
7550 
7551 OIIO_FORCEINLINE vfloat4 msub (const simd::vfloat4& a, const simd::vfloat4& b,
7552                               const simd::vfloat4& c)
7553 {
7554 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7555     // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
7556     return _mm_fmsub_ps (a, b, c);
7557 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7558     // If we directly access the underlying __m128, on some platforms and
7559     // compiler flags, it will turn into fma anyway, even if we don't use
7560     // the intrinsic.
7561     return a.simd() * b.simd() - c.simd();
7562 #else
7563     // Fallback: just use regular math and hope for the best.
7564     return a * b - c;
7565 #endif
7566 }
7567 
7568 
7569 
7570 OIIO_FORCEINLINE vfloat4 nmadd (const simd::vfloat4& a, const simd::vfloat4& b,
7571                                const simd::vfloat4& c)
7572 {
7573 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7574     // If we are sure _mm_fnmadd_ps intrinsic is available, use it.
7575     return _mm_fnmadd_ps (a, b, c);
7576 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7577     // If we directly access the underlying __m128, on some platforms and
7578     // compiler flags, it will turn into fma anyway, even if we don't use
7579     // the intrinsic.
7580     return c.simd() - a.simd() * b.simd();
7581 #else
7582     // Fallback: just use regular math and hope for the best.
7583     return c - a * b;
7584 #endif
7585 }
7586 
7587 
7588 
7589 OIIO_FORCEINLINE vfloat4 nmsub (const simd::vfloat4& a, const simd::vfloat4& b,
7590                                const simd::vfloat4& c)
7591 {
7592 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7593     // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
7594     return _mm_fnmsub_ps (a, b, c);
7595 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7596     // If we directly access the underlying __m128, on some platforms and
7597     // compiler flags, it will turn into fma anyway, even if we don't use
7598     // the intrinsic.
7599     return -(a.simd() * b.simd()) - c.simd();
7600 #else
7601     // Fallback: just use regular math and hope for the best.
7602     return -(a * b) - c;
7603 #endif
7604 }
7605 
7606 
7607 
7608 // Full precision exp() of all components of a SIMD vector.
7609 template<typename T>
7610 OIIO_FORCEINLINE T exp (const T& v)
7611 {
7612 #if OIIO_SIMD_SSE
7613     // Implementation inspired by:
7614     // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
7615     // Which is listed as Copyright (C) 2007  Julien Pommier and distributed
7616     // under the zlib license.
7617     typedef typename T::vint_t int_t;
7618     T x = v;
7619     const float exp_hi (88.3762626647949f);
7620     const float exp_lo (-88.3762626647949f);
7621     const float cephes_LOG2EF (1.44269504088896341f);
7622     const float cephes_exp_C1 (0.693359375f);
7623     const float cephes_exp_C2 (-2.12194440e-4f);
7624     const float cephes_exp_p0 (1.9875691500E-4f);
7625     const float cephes_exp_p1 (1.3981999507E-3f);
7626     const float cephes_exp_p2 (8.3334519073E-3f);
7627     const float cephes_exp_p3 (4.1665795894E-2f);
7628     const float cephes_exp_p4 (1.6666665459E-1f);
7629     const float cephes_exp_p5 (5.0000001201E-1f);
7630     T tmp (0.0f);
7631     T one (1.0f);
7632     x = min (x, T(exp_hi));
7633     x = max (x, T(exp_lo));
7634     T fx = madd (x, T(cephes_LOG2EF), T(0.5f));
7635     int_t emm0 = int_t(fx);
7636     tmp = T(emm0);
7637     T mask = bitcast_to_float (bitcast_to_int(tmp > fx) & bitcast_to_int(one));
7638     fx = tmp - mask;
7639     tmp = fx * cephes_exp_C1;
7640     T z = fx * cephes_exp_C2;
7641     x = x - tmp;
7642     x = x - z;
7643     z = x * x;
7644     T y = cephes_exp_p0;
7645     y = madd (y, x, cephes_exp_p1);
7646     y = madd (y, x, cephes_exp_p2);
7647     y = madd (y, x, cephes_exp_p3);
7648     y = madd (y, x, cephes_exp_p4);
7649     y = madd (y, x, cephes_exp_p5);
7650     y = madd (y, z, x);
7651     y = y + one;
7652     emm0 = (int_t(fx) + int_t(0x7f)) << 23;
7653     T pow2n = bitcast_to_float(emm0);
7654     y = y * pow2n;
7655     return y;
7656 #else
7657     SIMD_RETURN (T, expf(v[i]));
7658 #endif
7659 }
7660 
7661 
7662 
7663 // Full precision log() of all components of a SIMD vector.
7664 template<typename T>
7665 OIIO_FORCEINLINE T log (const T& v)
7666 {
7667 #if OIIO_SIMD_SSE
7668     // Implementation inspired by:
7669     // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
7670     // Which is listed as Copyright (C) 2007  Julien Pommier and distributed
7671     // under the zlib license.
7672     typedef typename T::vint_t int_t;
7673     typedef typename T::vbool_t bool_t;
7674     T x = v;
7675     int_t emm0;
7676     T zero (T::Zero());
7677     T one (1.0f);
7678     bool_t invalid_mask = (x <= zero);
7679     const int min_norm_pos ((int)0x00800000);
7680     const int inv_mant_mask ((int)~0x7f800000);
7681     x = max(x, bitcast_to_float(int_t(min_norm_pos)));  /* cut off denormalized stuff */
7682     emm0 = srl (bitcast_to_int(x), 23);
7683     /* keep only the fractional part */
7684     x = bitcast_to_float (bitcast_to_int(x) & int_t(inv_mant_mask));
7685     x = bitcast_to_float (bitcast_to_int(x) | bitcast_to_int(T(0.5f)));
7686     emm0 = emm0 - int_t(0x7f);
7687     T e (emm0);
7688     e = e + one;
7689     // OIIO_SIMD_vFLOAT4_CONST (cephes_SQRTHF, 0.707106781186547524f);
7690     const float cephes_SQRTHF (0.707106781186547524f);
7691     bool_t mask = (x < T(cephes_SQRTHF));
7692     T tmp = bitcast_to_float (bitcast_to_int(x) & bitcast_to_int(mask));
7693     x = x - one;
7694     e = e - bitcast_to_float (bitcast_to_int(one) & bitcast_to_int(mask));
7695     x = x + tmp;
7696     T z = x * x;
7697     const float cephes_log_p0 (7.0376836292E-2f);
7698     const float cephes_log_p1 (- 1.1514610310E-1f);
7699     const float cephes_log_p2 (1.1676998740E-1f);
7700     const float cephes_log_p3 (- 1.2420140846E-1f);
7701     const float cephes_log_p4 (+ 1.4249322787E-1f);
7702     const float cephes_log_p5 (- 1.6668057665E-1f);
7703     const float cephes_log_p6 (+ 2.0000714765E-1f);
7704     const float cephes_log_p7 (- 2.4999993993E-1f);
7705     const float cephes_log_p8 (+ 3.3333331174E-1f);
7706     const float cephes_log_q1 (-2.12194440e-4f);
7707     const float cephes_log_q2 (0.693359375f);
7708     T y = cephes_log_p0;
7709     y = madd (y, x, T(cephes_log_p1));
7710     y = madd (y, x, T(cephes_log_p2));
7711     y = madd (y, x, T(cephes_log_p3));
7712     y = madd (y, x, T(cephes_log_p4));
7713     y = madd (y, x, T(cephes_log_p5));
7714     y = madd (y, x, T(cephes_log_p6));
7715     y = madd (y, x, T(cephes_log_p7));
7716     y = madd (y, x, T(cephes_log_p8));
7717     y = y * x;
7718     y = y * z;
7719     y = madd(e, T(cephes_log_q1), y);
7720     y = nmadd (z, 0.5f, y);
7721     x = x + y;
7722     x = madd (e, T(cephes_log_q2), x);
7723     x = bitcast_to_float (bitcast_to_int(x) | bitcast_to_int(invalid_mask)); // negative arg will be NAN
7724     return x;
7725 #else
7726     SIMD_RETURN (T, logf(v[i]));
7727 #endif
7728 }
7729 
7730 
7731 
7732 OIIO_FORCEINLINE void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d)
7733 {
7734 #if OIIO_SIMD_SSE
7735     _MM_TRANSPOSE4_PS (a.simd(), b.simd(), c.simd(), d.simd());
7736 #else
7737     vfloat4 A (a[0], b[0], c[0], d[0]);
7738     vfloat4 B (a[1], b[1], c[1], d[1]);
7739     vfloat4 C (a[2], b[2], c[2], d[2]);
7740     vfloat4 D (a[3], b[3], c[3], d[3]);
7741     a = A;  b = B;  c = C;  d = D;
7742 #endif
7743 }
7744 
7745 
7746 OIIO_FORCEINLINE void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
7747                                  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3)
7748 {
7749 #if OIIO_SIMD_SSE
7750     //_MM_TRANSPOSE4_PS (a, b, c, d);
7751     auto l02 = _mm_unpacklo_ps (a, c);
7752     auto h02 = _mm_unpackhi_ps (a, c);
7753     auto l13 = _mm_unpacklo_ps (b, d);
7754     auto h13 = _mm_unpackhi_ps (b, d);
7755     r0 = vfloat4(_mm_unpacklo_ps (l02, l13));
7756     r1 = vfloat4(_mm_unpackhi_ps (l02, l13));
7757     r2 = vfloat4(_mm_unpacklo_ps (h02, h13));
7758     r3 = vfloat4(_mm_unpackhi_ps (h02, h13));
7759 #else
7760     r0.load (a[0], b[0], c[0], d[0]);
7761     r1.load (a[1], b[1], c[1], d[1]);
7762     r2.load (a[2], b[2], c[2], d[2]);
7763     r3.load (a[3], b[3], c[3], d[3]);
7764 #endif
7765 }
7766 
7767 
7768 OIIO_FORCEINLINE void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d)
7769 {
7770 #if OIIO_SIMD_SSE
7771     __m128 A = _mm_castsi128_ps (a);
7772     __m128 B = _mm_castsi128_ps (b);
7773     __m128 C = _mm_castsi128_ps (c);
7774     __m128 D = _mm_castsi128_ps (d);
7775     _MM_TRANSPOSE4_PS (A, B, C, D);
7776     a = _mm_castps_si128 (A);
7777     b = _mm_castps_si128 (B);
7778     c = _mm_castps_si128 (C);
7779     d = _mm_castps_si128 (D);
7780 #else
7781     vint4 A (a[0], b[0], c[0], d[0]);
7782     vint4 B (a[1], b[1], c[1], d[1]);
7783     vint4 C (a[2], b[2], c[2], d[2]);
7784     vint4 D (a[3], b[3], c[3], d[3]);
7785     a = A;  b = B;  c = C;  d = D;
7786 #endif
7787 }
7788 
7789 OIIO_FORCEINLINE void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
7790                                  vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3)
7791 {
7792 #if OIIO_SIMD_SSE
7793     //_MM_TRANSPOSE4_PS (a, b, c, d);
7794     __m128 A = _mm_castsi128_ps (a);
7795     __m128 B = _mm_castsi128_ps (b);
7796     __m128 C = _mm_castsi128_ps (c);
7797     __m128 D = _mm_castsi128_ps (d);
7798     _MM_TRANSPOSE4_PS (A, B, C, D);
7799     r0 = _mm_castps_si128 (A);
7800     r1 = _mm_castps_si128 (B);
7801     r2 = _mm_castps_si128 (C);
7802     r3 = _mm_castps_si128 (D);
7803 #else
7804     r0.load (a[0], b[0], c[0], d[0]);
7805     r1.load (a[1], b[1], c[1], d[1]);
7806     r2.load (a[2], b[2], c[2], d[2]);
7807     r3.load (a[3], b[3], c[3], d[3]);
7808 #endif
7809 }
7810 
7811 
7812 OIIO_FORCEINLINE vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
7813                                   const vfloat4& c, const vfloat4& d)
7814 {
7815 #if OIIO_SIMD_SSE
7816     vfloat4 l02 = _mm_unpacklo_ps (a, c);
7817     vfloat4 l13 = _mm_unpacklo_ps (b, d);
7818     return _mm_unpacklo_ps (l02, l13);
7819 #else
7820     return vfloat4 (a[0], b[0], c[0], d[0]);
7821 #endif
7822 }
7823 
7824 
7825 OIIO_FORCEINLINE vint4 AxBxCxDx (const vint4& a, const vint4& b,
7826                                 const vint4& c, const vint4& d)
7827 {
7828 #if OIIO_SIMD_SSE
7829     vint4 l02 = _mm_unpacklo_epi32 (a, c);
7830     vint4 l13 = _mm_unpacklo_epi32 (b, d);
7831     return _mm_unpacklo_epi32 (l02, l13);
7832 #else
7833     return vint4 (a[0], b[0], c[0], d[0]);
7834 #endif
7835 }
7836 
7837 
7838 
7839 //////////////////////////////////////////////////////////////////////
7840 // vfloat3 implementation
7841 
7842 OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat3 &other)  : vfloat4(other) {
7843 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
7844     m_simd = other.m_simd;
7845 #else
7846     SIMD_CONSTRUCT_PAD (other[i]);
7847 #endif
7848 }
7849 
7850 OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat4 &other) {
7851 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
7852     m_simd = other.simd();
7853 #else
7854     SIMD_CONSTRUCT_PAD (other[i]);
7855     m_val[3] = 0.0f;
7856 #endif
7857 }
7858 
7859 OIIO_FORCEINLINE const vfloat3 vfloat3::Zero () { return vfloat3(vfloat4::Zero()); }
7860 
7861 OIIO_FORCEINLINE const vfloat3 vfloat3::One () { return vfloat3(1.0f); }
7862 
7863 OIIO_FORCEINLINE const vfloat3 vfloat3::Iota (float start, float step) {
7864     return vfloat3 (start+0.0f*step, start+1.0f*step, start+2.0f*step);
7865 }
7866 
7867 
7868 OIIO_FORCEINLINE void vfloat3::load (float val) { vfloat4::load (val, val, val, 0.0f); }
7869 
7870 OIIO_FORCEINLINE void vfloat3::load (const float *values) { vfloat4::load (values, 3); }
7871 
7872 OIIO_FORCEINLINE void vfloat3::load (const float *values, int n) {
7873     vfloat4::load (values, n);
7874 }
7875 
7876 OIIO_FORCEINLINE void vfloat3::load (const unsigned short *values) {
7877     vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
7878 }
7879 
7880 OIIO_FORCEINLINE void vfloat3::load (const short *values) {
7881     vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
7882 }
7883 
7884 OIIO_FORCEINLINE void vfloat3::load (const unsigned char *values) {
7885     vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
7886 }
7887 
7888 OIIO_FORCEINLINE void vfloat3::load (const char *values) {
7889     vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
7890 }
7891 
7892 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
7893 OIIO_FORCEINLINE void vfloat3::load (const half *values) {
7894     vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
7895 }
7896 #endif /* _HALF_H_ or _IMATH_H_ */
7897 
7898 OIIO_FORCEINLINE void vfloat3::store (float *values) const {
7899     vfloat4::store (values, 3);
7900 }
7901 
7902 OIIO_FORCEINLINE void vfloat3::store (float *values, int n) const {
7903     vfloat4::store (values, n);
7904 }
7905 
7906 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
7907 OIIO_FORCEINLINE void vfloat3::store (half *values) const {
7908     SIMD_DO (values[i] = m_val[i]);
7909 }
7910 #endif
7911 
7912 OIIO_FORCEINLINE void vfloat3::store (Imath::V3f &vec) const {
7913     store ((float *)&vec);
7914 }
7915 
7916 OIIO_FORCEINLINE vfloat3 operator+ (const vfloat3& a, const vfloat3& b) {
7917     return vfloat3 (vfloat4(a) + vfloat4(b));
7918 }
7919 
7920 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator+= (const vfloat3& a) {
7921     *this = *this + a; return *this;
7922 }
7923 
7924 OIIO_FORCEINLINE vfloat3 vfloat3::operator- () const {
7925     return vfloat3 (-vfloat4(*this));
7926 }
7927 
7928 OIIO_FORCEINLINE vfloat3 operator- (const vfloat3& a, const vfloat3& b) {
7929     return vfloat3 (vfloat4(a) - vfloat4(b));
7930 }
7931 
7932 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator-= (const vfloat3& a) {
7933     *this = *this - a; return *this;
7934 }
7935 
7936 OIIO_FORCEINLINE vfloat3 operator* (const vfloat3& a, const vfloat3& b) {
7937     return vfloat3 (vfloat4(a) * vfloat4(b));
7938 }
7939 
7940 OIIO_FORCEINLINE vfloat3 operator* (const vfloat3& a, float b) {
7941     return vfloat3 (vfloat4(a) * b);
7942 }
7943 
7944 OIIO_FORCEINLINE vfloat3 operator* (float a, const vfloat3& b) {
7945     return b * a;
7946 }
7947 
7948 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator*= (const vfloat3& a) {
7949     *this = *this * a; return *this;
7950 }
7951 
7952 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator*= (float a) {
7953     *this = *this * a; return *this;
7954 }
7955 
7956 OIIO_FORCEINLINE vfloat3 operator/ (const vfloat3& a, const vfloat3& b) {
7957     return vfloat3 (vfloat4(a) / b.xyz1()); // Avoid divide by zero!
7958 }
7959 
7960 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator/= (const vfloat3& a) {
7961     *this = *this / a; return *this;
7962 }
7963 
7964 OIIO_FORCEINLINE const vfloat3 & vfloat3::operator/= (float a) {
7965     *this = *this / a; return *this;
7966 }
7967 
7968 
7969 inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val) {
7970     cout << val[0];
7971     for (int i = 1; i < val.elements; ++i)
7972         cout << ' ' << val[i];
7973     return cout;
7974 }
7975 
7976 
7977 OIIO_FORCEINLINE vfloat3 abs (const vfloat3& a)
7978 {
7979 #if OIIO_SIMD_SSE
7980     // Just clear the sign bit for cheap fabsf
7981     return vfloat3(_mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
7982 #elif OIIO_SIMD_NEON
7983     return vfloat3(vabsq_f32(a.simd()));
7984 #else
7985     SIMD_RETURN (vfloat3, fabsf(a[i]));
7986 #endif
7987 }
7988 
7989 
7990 OIIO_FORCEINLINE vfloat3 sign (const vfloat3& a)
7991 {
7992     vfloat3 one(1.0f);
7993     return vfloat3(blend (one, -one, a < vfloat3::Zero()));
7994 }
7995 
7996 
7997 OIIO_FORCEINLINE vfloat3 ceil (const vfloat3& a)
7998 {
7999 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
8000     return vfloat3(_mm_ceil_ps (a));
8001 #else
8002     SIMD_RETURN (vfloat3, ceilf(a[i]));
8003 #endif
8004 }
8005 
8006 OIIO_FORCEINLINE vfloat3 floor (const vfloat3& a)
8007 {
8008 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
8009     return vfloat3(_mm_floor_ps (a));
8010 #else
8011     SIMD_RETURN (vfloat3, floorf(a[i]));
8012 #endif
8013 }
8014 
8015 OIIO_FORCEINLINE vfloat3 round (const vfloat3& a)
8016 {
8017 #if OIIO_SIMD_SSE >= 4  /* SSE >= 4.1 */
8018     return vfloat3(_mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)));
8019 #else
8020     SIMD_RETURN (vfloat3, roundf(a[i]));
8021 #endif
8022 }
8023 
8024 
8025 OIIO_FORCEINLINE vfloat3 vreduce_add (const vfloat3& v) {
8026 #if OIIO_SIMD_SSE
8027     return vfloat3 ((vreduce_add(vfloat4(v))).xyz0());
8028 #else
8029     return vfloat3 (v[0] + v[1] + v[2]);
8030 #endif
8031 }
8032 
8033 
8034 OIIO_FORCEINLINE vfloat3 vdot (const vfloat3 &a, const vfloat3 &b) {
8035 #if OIIO_SIMD_SSE >= 4
8036     return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77));
8037 #else
8038     return vreduce_add (a*b);
8039 #endif
8040 }
8041 
8042 
8043 OIIO_FORCEINLINE float dot (const vfloat3 &a, const vfloat3 &b) {
8044 #if OIIO_SIMD_SSE >= 4
8045     return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
8046 #elif OIIO_SIMD
8047     return reduce_add (a*b);
8048 #else
8049     return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
8050 #endif
8051 }
8052 
8053 
8054 OIIO_FORCEINLINE vfloat3 vdot3 (const vfloat3 &a, const vfloat3 &b) {
8055 #if OIIO_SIMD_SSE >= 4
8056     return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77));
8057 #else
8058     return vfloat3 (vreduce_add((a*b).xyz0()).xyz0());
8059 #endif
8060 }
8061 
8062 
8063 OIIO_FORCEINLINE float vfloat3::length2 () const
8064 {
8065     return dot(*this, *this);
8066 }
8067 
8068 
8069 OIIO_FORCEINLINE float vfloat3::length () const
8070 {
8071     return sqrtf(dot(*this, *this));
8072 }
8073 
8074 
8075 OIIO_FORCEINLINE vfloat3 vfloat3::normalized () const
8076 {
8077 #if OIIO_SIMD
8078     vfloat3 len2 = vdot3 (*this, *this);
8079     return vfloat3 (safe_div (*this, sqrt(len2)));
8080 #else
8081     float len2 = dot (*this, *this);
8082     return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero();
8083 #endif
8084 }
8085 
8086 
8087 OIIO_FORCEINLINE vfloat3 vfloat3::normalized_fast () const
8088 {
8089 #if OIIO_SIMD
8090     vfloat3 len2 = vdot3 (*this, *this);
8091     vfloat4 invlen = blend0not (rsqrt_fast (len2), len2 == vfloat4::Zero());
8092     return vfloat3 ((*this) * invlen);
8093 #else
8094     float len2 = dot (*this, *this);
8095     return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero();
8096 #endif
8097 }
8098 
8099 
8100 
8101 //////////////////////////////////////////////////////////////////////
8102 // matrix44 implementation
8103 
8104 
8105 OIIO_FORCEINLINE const Imath::M44f& matrix44::M44f() const {
8106     return *(Imath::M44f*)this;
8107 }
8108 
8109 
8110 OIIO_FORCEINLINE vfloat4 matrix44::operator[] (int i) const {
8111 #if OIIO_SIMD_SSE
8112     return m_row[i];
8113 #else
8114     return vfloat4 (m_mat[i]);
8115 #endif
8116 }
8117 
8118 
8119 OIIO_FORCEINLINE matrix44 matrix44::transposed () const {
8120     matrix44 T;
8121 #if OIIO_SIMD_SSE
8122     simd::transpose (m_row[0], m_row[1], m_row[2], m_row[3],
8123                      T.m_row[0], T.m_row[1], T.m_row[2], T.m_row[3]);
8124 #else
8125     T.m_mat = m_mat.transposed();
8126 #endif
8127     return T;
8128 }
8129 
8130 OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {
8131 #if OIIO_SIMD_SSE
8132     vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8133                shuffle<2>(V) * m_row[2] + m_row[3];
8134     R = R / shuffle<3>(R);
8135     return vfloat3 (R.xyz0());
8136 #else
8137     Imath::V3f R;
8138     m_mat.multVecMatrix (*(Imath::V3f *)&V, R);
8139     return vfloat3(R);
8140 #endif
8141 }
8142 
8143 OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
8144 #if OIIO_SIMD_SSE
8145     vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8146                shuffle<2>(V) * m_row[2];
8147     return vfloat3 (R.xyz0());
8148 #else
8149     Imath::V3f R;
8150     m_mat.multDirMatrix (*(Imath::V3f *)&V, R);
8151     return vfloat3(R);
8152 #endif
8153 }
8154 
8155 OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
8156 #if OIIO_SIMD_SSE
8157     matrix44 T = transposed();
8158     vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8159                shuffle<2>(V) * T[2];
8160     return vfloat3 (R.xyz0());
8161 #else
8162     Imath::V3f R;
8163     m_mat.transposed().multDirMatrix (*(Imath::V3f *)&V, R);
8164     return vfloat3(R);
8165 #endif
8166 }
8167 
8168 OIIO_FORCEINLINE vfloat4 operator* (const vfloat4 &V, const matrix44& M)
8169 {
8170 #if OIIO_SIMD_SSE
8171     return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8172            shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8173 #else
8174     return vfloat4(V.V4f() * M.M44f());
8175 #endif
8176 }
8177 
8178 OIIO_FORCEINLINE vfloat4 operator* (const matrix44& M, const vfloat4 &V)
8179 {
8180 #if OIIO_SIMD_SSE >= 3
8181     vfloat4 m0v = M[0] * V;  // [ M00*Vx, M01*Vy, M02*Vz, M03*Vw ]
8182     vfloat4 m1v = M[1] * V;  // [ M10*Vx, M11*Vy, M12*Vz, M13*Vw ]
8183     vfloat4 m2v = M[2] * V;  // [ M20*Vx, M21*Vy, M22*Vz, M23*Vw ]
8184     vfloat4 m3v = M[3] * V;  // [ M30*Vx, M31*Vy, M32*Vz, M33*Vw ]
8185     vfloat4 s01 = _mm_hadd_ps(m0v, m1v);
8186        // [ M00*Vx + M01*Vy, M02*Vz + M03*Vw, M10*Vx + M11*Vy, M12*Vz + M13*Vw ]
8187     vfloat4 s23 = _mm_hadd_ps(m2v, m3v);
8188        // [ M20*Vx + M21*Vy, M22*Vz + M23*Vw, M30*Vx + M31*Vy, M32*Vz + M33*Vw ]
8189     vfloat4 result = _mm_hadd_ps(s01, s23);
8190        // [ M00*Vx + M01*Vy + M02*Vz + M03*Vw,
8191        //   M10*Vx + M11*Vy + M12*Vz + M13*Vw,
8192        //   M20*Vx + M21*Vy + M22*Vz + M23*Vw,
8193        //   M30*Vx + M31*Vy + M32*Vz + M33*Vw ]
8194     return result;
8195 #else
8196     return vfloat4(dot(M[0], V), dot(M[1], V), dot(M[2], V), dot(M[3], V));
8197 #endif
8198 }
8199 
8200 
8201 OIIO_FORCEINLINE bool matrix44::operator== (const matrix44& m) const {
8202 #if OIIO_SIMD_SSE
8203     vbool4 b0 = (m_row[0] == m[0]);
8204     vbool4 b1 = (m_row[1] == m[1]);
8205     vbool4 b2 = (m_row[2] == m[2]);
8206     vbool4 b3 = (m_row[3] == m[3]);
8207     return simd::all (b0 & b1 & b2 & b3);
8208 #else
8209     return memcmp(this, &m, 16*sizeof(float)) == 0;
8210 #endif
8211 }
8212 
8213 OIIO_FORCEINLINE bool matrix44::operator== (const Imath::M44f& m) const {
8214     return memcmp(this, &m, 16*sizeof(float)) == 0;
8215 }
8216 
8217 OIIO_FORCEINLINE bool operator== (const Imath::M44f& a, const matrix44 &b) {
8218     return (b == a);
8219 }
8220 
8221 OIIO_FORCEINLINE bool matrix44::operator!= (const matrix44& m) const {
8222 #if OIIO_SIMD_SSE
8223     vbool4 b0 = (m_row[0] != m[0]);
8224     vbool4 b1 = (m_row[1] != m[1]);
8225     vbool4 b2 = (m_row[2] != m[2]);
8226     vbool4 b3 = (m_row[3] != m[3]);
8227     return simd::any (b0 | b1 | b2 | b3);
8228 #else
8229     return memcmp(this, &m, 16*sizeof(float)) != 0;
8230 #endif
8231 }
8232 
8233 OIIO_FORCEINLINE bool matrix44::operator!= (const Imath::M44f& m) const {
8234     return memcmp(this, &m, 16*sizeof(float)) != 0;
8235 }
8236 
8237 OIIO_FORCEINLINE bool operator!= (const Imath::M44f& a, const matrix44 &b) {
8238     return (b != a);
8239 }
8240 
8241 OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
8242 #if OIIO_SIMD_SSE
8243     // Adapted from this code from Intel:
8244     // ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
8245     vfloat4 minor0, minor1, minor2, minor3;
8246     vfloat4 row0, row1, row2, row3;
8247     vfloat4 det, tmp1;
8248     const float *src = (const float *)this;
8249     vfloat4 zero = vfloat4::Zero();
8250     tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
8251     row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
8252     row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
8253     row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
8254     tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
8255     row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
8256     row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
8257     row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
8258     // -----------------------------------------------
8259     tmp1 = row2 * row3;
8260     tmp1 = shuffle<1,0,3,2>(tmp1);
8261     minor0 = row1 * tmp1;
8262     minor1 = row0 * tmp1;
8263     tmp1 = shuffle<2,3,0,1>(tmp1);
8264     minor0 = (row1 * tmp1) - minor0;
8265     minor1 = (row0 * tmp1) - minor1;
8266     minor1 = shuffle<2,3,0,1>(minor1);
8267     // -----------------------------------------------
8268     tmp1 = row1 * row2;
8269     tmp1 = shuffle<1,0,3,2>(tmp1);
8270     minor0 = (row3 * tmp1) + minor0;
8271     minor3 = row0 * tmp1;
8272     tmp1 = shuffle<2,3,0,1>(tmp1);
8273     minor0 = minor0 - (row3 * tmp1);
8274     minor3 = (row0 * tmp1) - minor3;
8275     minor3 = shuffle<2,3,0,1>(minor3);
8276     // -----------------------------------------------
8277     tmp1 = shuffle<2,3,0,1>(row1) * row3;
8278     tmp1 = shuffle<1,0,3,2>(tmp1);
8279     row2 = shuffle<2,3,0,1>(row2);
8280     minor0 = (row2 * tmp1) + minor0;
8281     minor2 = row0 * tmp1;
8282     tmp1 = shuffle<2,3,0,1>(tmp1);
8283     minor0 = minor0 - (row2 * tmp1);
8284     minor2 = (row0 * tmp1) - minor2;
8285     minor2 = shuffle<2,3,0,1>(minor2);
8286     // -----------------------------------------------
8287     tmp1 = row0 * row1;
8288     tmp1 = shuffle<1,0,3,2>(tmp1);
8289     minor2 = (row3 * tmp1) + minor2;
8290     minor3 = (row2 * tmp1) - minor3;
8291     tmp1 = shuffle<2,3,0,1>(tmp1);
8292     minor2 = (row3 * tmp1) - minor2;
8293     minor3 = minor3 - (row2 * tmp1);
8294     // -----------------------------------------------
8295     tmp1 = row0 * row3;
8296     tmp1 = shuffle<1,0,3,2>(tmp1);
8297     minor1 = minor1 - (row2 * tmp1);
8298     minor2 = (row1 * tmp1) + minor2;
8299     tmp1 = shuffle<2,3,0,1>(tmp1);
8300     minor1 = (row2 * tmp1) + minor1;
8301     minor2 = minor2 - (row1 * tmp1);
8302     // -----------------------------------------------
8303     tmp1 = row0 * row2;
8304     tmp1 = shuffle<1,0,3,2>(tmp1);
8305     minor1 = (row3 * tmp1) + minor1;
8306     minor3 = minor3 - (row1 * tmp1);
8307     tmp1 = shuffle<2,3,0,1>(tmp1);
8308     minor1 = minor1 - (row3 * tmp1);
8309     minor3 = (row1 * tmp1) + minor3;
8310     // -----------------------------------------------
8311     det = row0 * minor0;
8312     det = shuffle<2,3,0,1>(det) + det;
8313     det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
8314     tmp1 = vfloat4(_mm_rcp_ss(det));
8315     det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
8316     det = shuffle<0>(det);
8317     return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
8318 #else
8319     return matrix44 (m_mat.inverse());
8320 #endif
8321 }
8322 
8323 
8324 inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {
8325     const float *m = (const float *)&M;
8326     cout << m[0];
8327     for (int i = 1; i < 16; ++i)
8328         cout << ' ' << m[i];
8329     return cout;
8330 }
8331 
8332 
8333 
8334 OIIO_FORCEINLINE vfloat3 transformp (const matrix44 &M, const vfloat3 &V) {
8335     return M.transformp (V);
8336 }
8337 
8338 OIIO_FORCEINLINE vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V)
8339 {
8340 #if OIIO_SIMD
8341     return matrix44(M).transformp (V);
8342 #else
8343     Imath::V3f R;
8344     M.multVecMatrix (*(const Imath::V3f *)&V, R);
8345     return vfloat3(R);
8346 #endif
8347 }
8348 
8349 
8350 OIIO_FORCEINLINE vfloat3 transformv (const matrix44 &M, const vfloat3 &V) {
8351     return M.transformv (V);
8352 }
8353 
8354 OIIO_FORCEINLINE vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V)
8355 {
8356 #if OIIO_SIMD
8357     return matrix44(M).transformv (V);
8358 #else
8359     Imath::V3f R;
8360     M.multDirMatrix (*(const Imath::V3f *)&V, R);
8361     return vfloat3(R);
8362 #endif
8363 }
8364 
8365 OIIO_FORCEINLINE vfloat3 transformvT (const matrix44 &M, const vfloat3 &V)
8366 {
8367     return M.transformvT (V);
8368 }
8369 
8370 OIIO_FORCEINLINE vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V)
8371 {
8372 #if OIIO_SIMD
8373     return matrix44(M).transformvT(V);
8374 #else
8375     return transformv (M.transposed(), V);
8376 #endif
8377 }
8378 
8379 
8380 
8381 //////////////////////////////////////////////////////////////////////
8382 // vfloat8 implementation
8383 
8384 OIIO_FORCEINLINE float& vfloat8::operator[] (int i) {
8385     OIIO_DASSERT(i<elements);
8386     return m_val[i];
8387 }
8388 
8389 OIIO_FORCEINLINE float vfloat8::operator[] (int i) const {
8390     OIIO_DASSERT(i<elements);
8391     return m_val[i];
8392 }
8393 
8394 
8395 inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val) {
8396     cout << val[0];
8397     for (int i = 1; i < val.elements; ++i)
8398         cout << ' ' << val[i];
8399     return cout;
8400 }
8401 
8402 
8403 OIIO_FORCEINLINE vfloat4 vfloat8::lo () const {
8404 #if OIIO_SIMD_AVX
8405     return _mm256_castps256_ps128 (simd());
8406 #else
8407     return m_4[0];
8408 #endif
8409 }
8410 
8411 OIIO_FORCEINLINE vfloat4 vfloat8::hi () const {
8412 #if OIIO_SIMD_AVX
8413     return _mm256_extractf128_ps (simd(), 1);
8414 #else
8415     return m_4[1];
8416 #endif
8417 }
8418 
8419 
8420 OIIO_FORCEINLINE vfloat8::vfloat8 (const vfloat4& lo, const vfloat4 &hi) {
8421 #if OIIO_SIMD_AVX
8422     __m256 r = _mm256_castps128_ps256 (lo);
8423     m_simd = _mm256_insertf128_ps (r, hi, 1);
8424     // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
8425     // FIXME: when would that not be available?
8426 #else
8427     m_4[0] = lo;
8428     m_4[1] = hi;
8429 #endif
8430 }
8431 
8432 
8433 OIIO_FORCEINLINE vfloat8::vfloat8 (const vint8& ival) {
8434 #if OIIO_SIMD_AVX
8435     m_simd = _mm256_cvtepi32_ps (ival);
8436 #else
8437     SIMD_CONSTRUCT (float(ival[i]));
8438 #endif
8439 }
8440 
8441 
8442 OIIO_FORCEINLINE const vfloat8 vfloat8::Zero () {
8443 #if OIIO_SIMD_AVX
8444     return _mm256_setzero_ps();
8445 #else
8446     return vfloat8(0.0f);
8447 #endif
8448 }
8449 
8450 OIIO_FORCEINLINE const vfloat8 vfloat8::One () {
8451     return vfloat8(1.0f);
8452 }
8453 
8454 OIIO_FORCEINLINE const vfloat8 vfloat8::Iota (float start, float step) {
8455     return vfloat8 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
8456                    start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step);
8457 }
8458 
8459 /// Set all components to 0.0
8460 OIIO_FORCEINLINE void vfloat8::clear () {
8461 #if OIIO_SIMD_AVX
8462     m_simd = _mm256_setzero_ps();
8463 #else
8464     load (0.0f);
8465 #endif
8466 }
8467 
8468 
8469 
8470 OIIO_FORCEINLINE void vfloat8::load (float val) {
8471 #if OIIO_SIMD_AVX
8472     m_simd = _mm256_set1_ps (val);
8473 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8474     m_4[0].load(val);
8475     m_4[1].load(val);
8476 #else
8477     SIMD_CONSTRUCT (val);
8478 #endif
8479 }
8480 
8481 OIIO_FORCEINLINE void vfloat8::load (float a, float b, float c, float d,
8482                                     float e, float f, float g, float h) {
8483 #if OIIO_SIMD_AVX
8484     m_simd = _mm256_set_ps (h, g, f, e, d, c, b, a);
8485 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8486     m_4[0].load(a, b, c, d);
8487     m_4[1].load(e, f, g, h);
8488 #else
8489     m_val[0] = a;
8490     m_val[1] = b;
8491     m_val[2] = c;
8492     m_val[3] = d;
8493     m_val[4] = e;
8494     m_val[5] = f;
8495     m_val[6] = g;
8496     m_val[7] = h;
8497 #endif
8498 }
8499 
8500 
8501 OIIO_FORCEINLINE void vfloat8::load (const float *values) {
8502 #if OIIO_SIMD_AVX
8503     m_simd = _mm256_loadu_ps (values);
8504 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8505     m_4[0].load(values);
8506     m_4[1].load(values+4);
8507 #else
8508     SIMD_CONSTRUCT (values[i]);
8509 #endif
8510 }
8511 
8512 
8513 OIIO_FORCEINLINE void vfloat8::load (const float *values, int n) {
8514     OIIO_DASSERT (n >= 0 && n <= elements);
8515 #if 0 && OIIO_AVX512VL_ENABLED
8516     // This SHOULD be fast, but in my benchmarks, it is slower!
8517     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
8518     // Re-test this periodically with new Intel hardware.
8519     m_simd = _mm256_maskz_loadu_ps ((~(0xff << n)), values);
8520 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8521     if (n > 4) {
8522         vfloat4 lo, hi;
8523         lo.load (values);
8524         hi.load (values+4, n-4);
8525         m_4[0] = lo;
8526         m_4[1] = hi;
8527     } else {
8528         vfloat4 lo, hi;
8529         lo.load (values, n);
8530         hi.clear();
8531         m_4[0] = lo;
8532         m_4[1] = hi;
8533     }
8534 #else
8535     for (int i = 0; i < n; ++i)
8536         m_val[i] = values[i];
8537     for (int i = n; i < paddedelements; ++i)
8538         m_val[i] = 0;
8539 #endif
8540 }
8541 
8542 
8543 OIIO_FORCEINLINE void vfloat8::load (const unsigned short *values) {
8544 #if OIIO_SIMD_AVX
8545     // Rely on the ushort->int conversion, then convert to float
8546     m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8547 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8548     m_4[0].load(values);
8549     m_4[1].load(values+4);
8550 #else
8551     SIMD_CONSTRUCT (values[i]);
8552 #endif
8553 }
8554 
8555 
8556 OIIO_FORCEINLINE void vfloat8::load (const short *values) {
8557 #if OIIO_SIMD_AVX
8558     // Rely on the short->int conversion, then convert to float
8559     m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8560 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8561     m_4[0].load(values);
8562     m_4[1].load(values+4);
8563 #else
8564     SIMD_CONSTRUCT (values[i]);
8565 #endif
8566 }
8567 
8568 
8569 OIIO_FORCEINLINE void vfloat8::load (const unsigned char *values) {
8570 #if OIIO_SIMD_AVX
8571     m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8572 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8573     m_4[0].load(values);
8574     m_4[1].load(values+4);
8575 #else
8576     SIMD_CONSTRUCT (values[i]);
8577 #endif
8578 }
8579 
8580 
8581 OIIO_FORCEINLINE void vfloat8::load (const char *values) {
8582 #if OIIO_SIMD_AVX
8583     m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8584 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8585     m_4[0].load(values);
8586     m_4[1].load(values+4);
8587 #else
8588     SIMD_CONSTRUCT (values[i]);
8589 #endif
8590 }
8591 
8592 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8593 OIIO_FORCEINLINE void vfloat8::load (const half *values) {
8594 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8595     /* Enabled 16 bit float instructions! */
8596     vint4 a ((const int *)values);
8597     m_simd = _mm256_cvtph_ps (a);
8598 #elif OIIO_SIMD_SSE >= 2
8599     m_4[0] = vfloat4(values);
8600     m_4[1] = vfloat4(values+4);
8601 #else /* No SIMD defined: */
8602     SIMD_CONSTRUCT (values[i]);
8603 #endif
8604 }
8605 #endif /* _HALF_H_ or _IMATH_H_ */
8606 
8607 
8608 OIIO_FORCEINLINE void vfloat8::store (float *values) const {
8609 #if OIIO_SIMD_AVX
8610     // Use an unaligned store -- it's just as fast when the memory turns
8611     // out to be aligned, nearly as fast even when unaligned. Not worth
8612     // the headache of using stores that require alignment.
8613     _mm256_storeu_ps (values, m_simd);
8614 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8615     m_4[0].store(values);
8616     m_4[1].store(values+4);
8617 #else
8618     SIMD_DO (values[i] = m_val[i]);
8619 #endif
8620 }
8621 
8622 
8623 OIIO_FORCEINLINE void vfloat8::store (float *values, int n) const {
8624     OIIO_DASSERT (n >= 0 && n <= elements);
8625 #if 0 && OIIO_AVX512VL_ENABLED
8626     // This SHOULD be fast, but in my benchmarks, it is slower!
8627     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
8628     // Re-test this periodically with new Intel hardware.
8629     _mm256_mask_storeu_ps (values,  __mmask8(~(0xff << n)), m_simd);
8630 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8631     if (n <= 4) {
8632         lo().store (values, n);
8633     } else if (n <= 8) {
8634         lo().store (values);
8635         hi().store (values+4, n-4);
8636     }
8637 #else
8638     for (int i = 0; i < n; ++i)
8639         values[i] = m_val[i];
8640 #endif
8641 }
8642 
8643 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8644 OIIO_FORCEINLINE void vfloat8::store (half *values) const {
8645 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8646     __m128i h = _mm256_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
8647     _mm_storeu_si128 ((__m128i *)values, h);
8648 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8649     m_4[0].store(values);
8650     m_4[1].store(values+4);
8651 #else
8652     SIMD_DO (values[i] = m_val[i]);
8653 #endif
8654 }
8655 #endif
8656 
8657 
8658 OIIO_FORCEINLINE void vfloat8::load_mask (int mask, const float *values) {
8659 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8660     m_simd = _mm256_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
8661 #elif OIIO_SIMD_AVX
8662     m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)));
8663 #else
8664     SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
8665 #endif
8666 }
8667 
8668 
8669 OIIO_FORCEINLINE void vfloat8::load_mask (const vbool8& mask, const float *values) {
8670 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8671     m_simd = _mm256_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
8672 #elif OIIO_SIMD_AVX
8673     m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(mask));
8674 #else
8675     SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
8676 #endif
8677 }
8678 
8679 
8680 OIIO_FORCEINLINE void vfloat8::store_mask (int mask, float *values) const {
8681 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8682     _mm256_mask_storeu_ps (values, __mmask8(mask), m_simd);
8683 #elif OIIO_SIMD_AVX
8684     _mm256_maskstore_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd);
8685 #else
8686     SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
8687 #endif
8688 }
8689 
8690 
8691 OIIO_FORCEINLINE void vfloat8::store_mask (const vbool8& mask, float *values) const {
8692 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8693     _mm256_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
8694 #elif OIIO_SIMD_AVX
8695     _mm256_maskstore_ps (values, _mm256_castps_si256(mask.simd()), m_simd);
8696 #else
8697     SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
8698 #endif
8699 }
8700 
8701 
8702 template <int scale>
8703 OIIO_FORCEINLINE void
8704 vfloat8::gather (const value_t *baseptr, const vint_t& vindex)
8705 {
8706 #if OIIO_SIMD_AVX >= 2
8707     m_simd = _mm256_i32gather_ps (baseptr, vindex, scale);
8708 #else
8709     SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
8710 #endif
8711 }
8712 
8713 template<int scale>
8714 OIIO_FORCEINLINE void
8715 vfloat8::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
8716 {
8717 #if OIIO_SIMD_AVX >= 2
8718     m_simd = _mm256_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
8719 #else
8720     SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
8721 #endif
8722 }
8723 
8724 template<int scale>
8725 OIIO_FORCEINLINE void
8726 vfloat8::scatter (value_t *baseptr, const vint_t& vindex) const
8727 {
8728 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8729     _mm256_i32scatter_ps (baseptr, vindex, m_simd, scale);
8730 #else
8731     SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
8732 #endif
8733 }
8734 
8735 template<int scale>
8736 OIIO_FORCEINLINE void
8737 vfloat8::scatter_mask (const bool_t& mask, value_t *baseptr,
8738                        const vint_t& vindex) const
8739 {
8740 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8741     _mm256_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
8742 #else
8743     SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
8744 #endif
8745 }
8746 
8747 
8748 
8749 OIIO_FORCEINLINE vfloat8 operator+ (const vfloat8& a, const vfloat8& b) {
8750 #if OIIO_SIMD_AVX
8751     return _mm256_add_ps (a, b);
8752 #else
8753     return vfloat8 (a.lo()+b.lo(), a.hi()+b.hi());
8754 #endif
8755 }
8756 
8757 OIIO_FORCEINLINE const vfloat8 & operator+= (vfloat8 & a, const vfloat8& b) {
8758     return a = a + b;
8759 }
8760 
8761 OIIO_FORCEINLINE vfloat8 operator- (const vfloat8& a) {
8762 #if OIIO_SIMD_AVX
8763     return _mm256_sub_ps (_mm256_setzero_ps(), a);
8764 #else
8765     return vfloat8 (-a.lo(), -a.hi());
8766 #endif
8767 }
8768 
8769 OIIO_FORCEINLINE vfloat8 operator- (const vfloat8& a, const vfloat8& b) {
8770 #if OIIO_SIMD_AVX
8771     return _mm256_sub_ps (a, b);
8772 #else
8773     return vfloat8 (a.lo()-b.lo(), a.hi()-b.hi());
8774 #endif
8775 }
8776 
8777 OIIO_FORCEINLINE const vfloat8 & operator-= (vfloat8 & a, const vfloat8& b) {
8778     return a = a - b;
8779 }
8780 
8781 OIIO_FORCEINLINE vfloat8 operator* (const vfloat8& a, float b) {
8782 #if OIIO_SIMD_AVX
8783     return _mm256_mul_ps (a.m_simd, _mm256_set1_ps(b));
8784 #else
8785     return vfloat8 (a.lo()*b, a.hi()*b);
8786 #endif
8787 }
8788 
8789 OIIO_FORCEINLINE vfloat8 operator* (float a, const vfloat8& b) {
8790     return b * a;
8791 }
8792 
8793 OIIO_FORCEINLINE vfloat8 operator* (const vfloat8& a, const vfloat8& b) {
8794 #if OIIO_SIMD_AVX
8795     return _mm256_mul_ps (a, b);
8796 #else
8797     return vfloat8 (a.lo()*b.lo(), a.hi()*b.hi());
8798 #endif
8799 }
8800 
8801 OIIO_FORCEINLINE const vfloat8 & operator*= (vfloat8 & a, const vfloat8& b) {
8802     return a = a * b;
8803 }
8804 
8805 OIIO_FORCEINLINE vfloat8 operator/ (const vfloat8& a, const vfloat8& b) {
8806 #if OIIO_SIMD_AVX
8807     return _mm256_div_ps (a, b);
8808 #else
8809     return vfloat8 (a.lo()/b.lo(), a.hi()/b.hi());
8810 #endif
8811 }
8812 
8813 OIIO_FORCEINLINE const vfloat8 & operator/= (vfloat8 & a, const vfloat8& b) {
8814     return a = a / b;
8815 }
8816 
8817 OIIO_FORCEINLINE vbool8 operator== (const vfloat8& a, const vfloat8& b) {
8818 #if OIIO_SIMD_AVX
8819     return _mm256_cmp_ps (a, b, _CMP_EQ_OQ);
8820 #else
8821     return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
8822 #endif
8823 }
8824 
8825 OIIO_FORCEINLINE vbool8 operator!= (const vfloat8& a, const vfloat8& b) {
8826 #if OIIO_SIMD_AVX
8827     return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ);
8828 #else
8829     return vbool8 (a.lo() != b.lo(), a.hi() != b.hi());
8830 #endif
8831 }
8832 
8833 OIIO_FORCEINLINE vbool8 operator< (const vfloat8& a, const vfloat8& b) {
8834 #if OIIO_SIMD_AVX
8835     return _mm256_cmp_ps (a, b, _CMP_LT_OQ);
8836 #else
8837     return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
8838 #endif
8839 }
8840 
8841 OIIO_FORCEINLINE vbool8 operator>  (const vfloat8& a, const vfloat8& b) {
8842 #if OIIO_SIMD_AVX
8843     return _mm256_cmp_ps (a, b, _CMP_GT_OQ);
8844 #else
8845     return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
8846 #endif
8847 }
8848 
8849 OIIO_FORCEINLINE vbool8 operator>= (const vfloat8& a, const vfloat8& b) {
8850 #if OIIO_SIMD_AVX
8851     return _mm256_cmp_ps (a, b, _CMP_GE_OQ);
8852 #else
8853     return vbool8 (a.lo() >= b.lo(), a.hi() >= b.hi());
8854 #endif
8855 }
8856 
8857 OIIO_FORCEINLINE vbool8 operator<= (const vfloat8& a, const vfloat8& b) {
8858 #if OIIO_SIMD_AVX
8859     return _mm256_cmp_ps (a, b, _CMP_LE_OQ);
8860 #else
8861     return vbool8 (a.lo() <= b.lo(), a.hi() <= b.hi());
8862 #endif
8863 }
8864 
8865 
8866 // Implementation had to be after the definition of vfloat8.
8867 OIIO_FORCEINLINE vint8::vint8 (const vfloat8& f)
8868 {
8869 #if OIIO_SIMD_AVX
8870     m_simd = _mm256_cvttps_epi32(f);
8871 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8872     *this = vint8 (vint4(f.lo()), vint4(f.hi()));
8873 #else
8874     SIMD_CONSTRUCT ((int) f[i]);
8875 #endif
8876 }
8877 
8878 
8879 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
8880 OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
8881 #if OIIO_SIMD_AVX >= 2
8882     vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
8883     return _mm256_permutevar8x32_ps (a, index);
8884 #else
8885     return vfloat8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
8886 #endif
8887 }
8888 
8889 template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
8890 #if OIIO_SIMD_AVX >= 2
8891     return _mm256_permutevar8x32_ps (a, vint8(i));
8892 #else
8893     return shuffle<i,i,i,i,i,i,i,i>(a);
8894 #endif
8895 }
8896 
8897 
8898 template<int i>
8899 OIIO_FORCEINLINE float extract (const vfloat8& v) {
8900 #if OIIO_SIMD_AVX_NO_FIXME
8901     // Looks like the fastest we can do it is to extract a vfloat4,
8902     // shuffle its one element everywhere, then extract element 0.
8903     _m128 f4 = _mm256_extractf128_ps (i >> 2);
8904     int j = i & 3;
8905     return _mm_cvtss_f32(shuffle_sse<j,j,j,j>(a.simd()));
8906 #else
8907     return v[i];
8908 #endif
8909 }
8910 
8911 
8912 template<int i>
8913 OIIO_FORCEINLINE vfloat8 insert (const vfloat8& a, float val) {
8914 #if OIIO_SIMD_AVX_NO_FIXME
8915     return _mm256_insert_epi32 (a, val, i);
8916 #else
8917     vfloat8 tmp = a;
8918     tmp[i] = val;
8919     return tmp;
8920 #endif
8921 }
8922 
8923 
8924 OIIO_FORCEINLINE float vfloat8::x () const { return extract<0>(*this); }
8925 OIIO_FORCEINLINE float vfloat8::y () const { return extract<1>(*this); }
8926 OIIO_FORCEINLINE float vfloat8::z () const { return extract<2>(*this); }
8927 OIIO_FORCEINLINE float vfloat8::w () const { return extract<3>(*this); }
8928 OIIO_FORCEINLINE void vfloat8::set_x (float val) { *this = insert<0>(*this, val); }
8929 OIIO_FORCEINLINE void vfloat8::set_y (float val) { *this = insert<1>(*this, val); }
8930 OIIO_FORCEINLINE void vfloat8::set_z (float val) { *this = insert<2>(*this, val); }
8931 OIIO_FORCEINLINE void vfloat8::set_w (float val) { *this = insert<3>(*this, val); }
8932 
8933 
8934 OIIO_FORCEINLINE vint8 bitcast_to_int (const vfloat8& x)
8935 {
8936 #if OIIO_SIMD_AVX
8937     return _mm256_castps_si256 (x.simd());
8938 #else
8939     return *(vint8 *)&x;
8940 #endif
8941 }
8942 
8943 OIIO_FORCEINLINE vfloat8 bitcast_to_float (const vint8& x)
8944 {
8945 #if OIIO_SIMD_AVX
8946     return _mm256_castsi256_ps (x.simd());
8947 #else
8948     return *(vfloat8 *)&x;
8949 #endif
8950 }
8951 
8952 
8953 OIIO_FORCEINLINE vfloat8 vreduce_add (const vfloat8& v) {
8954 #if OIIO_SIMD_AVX
8955     // From Syrah:
8956     vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
8957     vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
8958     // get efgh in the 0-idx slot
8959     vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
8960     vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
8961     return shuffle<0>(final_sum);
8962 #else
8963     vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
8964     return vfloat8(hadd4, hadd4);
8965 #endif
8966 }
8967 
8968 
8969 OIIO_FORCEINLINE float reduce_add (const vfloat8& v) {
8970 #if OIIO_SIMD_AVX >= 2
8971     return extract<0>(vreduce_add(v));
8972 #else
8973     return reduce_add(v.lo()) + reduce_add(v.hi());
8974 #endif
8975 }
8976 
8977 
8978 OIIO_FORCEINLINE vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask)
8979 {
8980 #if OIIO_SIMD_AVX
8981     return _mm256_blendv_ps (a, b, mask);
8982 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8983     return vfloat8 (blend (a.lo(), b.lo(), mask.lo()),
8984                    blend (a.hi(), b.hi(), mask.hi()));
8985 #else
8986     SIMD_RETURN (vfloat8, mask[i] ? b[i] : a[i]);
8987 #endif
8988 }
8989 
8990 
8991 OIIO_FORCEINLINE vfloat8 blend0 (const vfloat8& a, const vbool8& mask)
8992 {
8993 #if OIIO_SIMD_AVX
8994     return _mm256_and_ps(mask, a);
8995 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8996     return vfloat8 (blend0 (a.lo(), mask.lo()),
8997                    blend0 (a.hi(), mask.hi()));
8998 #else
8999     SIMD_RETURN (vfloat8, mask[i] ? a[i] : 0.0f);
9000 #endif
9001 }
9002 
9003 
9004 OIIO_FORCEINLINE vfloat8 blend0not (const vfloat8& a, const vbool8& mask)
9005 {
9006 #if OIIO_SIMD_AVX
9007     return _mm256_andnot_ps(mask, a);
9008 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9009     return vfloat8 (blend0not (a.lo(), mask.lo()),
9010                    blend0not (a.hi(), mask.hi()));
9011 #else
9012     SIMD_RETURN (vfloat8, mask[i] ? 0.0f : a[i]);
9013 #endif
9014 }
9015 
9016 
9017 OIIO_FORCEINLINE vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b)
9018 {
9019     return blend (b, a, mask);
9020 }
9021 
9022 
9023 OIIO_FORCEINLINE vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b) {
9024 #if OIIO_SIMD_SSE
9025     return blend0not (a/b, b == vfloat8::Zero());
9026 #else
9027     SIMD_RETURN (vfloat8, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
9028 #endif
9029 }
9030 
9031 
9032 OIIO_FORCEINLINE vfloat8 abs (const vfloat8& a)
9033 {
9034 #if OIIO_SIMD_AVX
9035     // Just clear the sign bit for cheap fabsf
9036     return _mm256_and_ps (a.simd(), _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
9037 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9038     return vfloat8(abs(a.lo()), abs(a.hi()));
9039 #else
9040     SIMD_RETURN (vfloat8, fabsf(a[i]));
9041 #endif
9042 }
9043 
9044 
9045 OIIO_FORCEINLINE vfloat8 sign (const vfloat8& a)
9046 {
9047     vfloat8 one(1.0f);
9048     return blend (one, -one, a < vfloat8::Zero());
9049 }
9050 
9051 
9052 OIIO_FORCEINLINE vfloat8 ceil (const vfloat8& a)
9053 {
9054 #if OIIO_SIMD_AVX
9055     return _mm256_ceil_ps (a);
9056 #else
9057     SIMD_RETURN (vfloat8, ceilf(a[i]));
9058 #endif
9059 }
9060 
9061 OIIO_FORCEINLINE vfloat8 floor (const vfloat8& a)
9062 {
9063 #if OIIO_SIMD_AVX
9064     return _mm256_floor_ps (a);
9065 #else
9066     SIMD_RETURN (vfloat8, floorf(a[i]));
9067 #endif
9068 }
9069 
9070 OIIO_FORCEINLINE vfloat8 round (const vfloat8& a)
9071 {
9072 #if OIIO_SIMD_AVX
9073     return _mm256_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9074 #else
9075     SIMD_RETURN (vfloat8, roundf(a[i]));
9076 #endif
9077 }
9078 
9079 OIIO_FORCEINLINE vint8 ifloor (const vfloat8& a)
9080 {
9081     // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
9082 #if OIIO_SIMD_AVX
9083     return vint8(floor(a));
9084 #elif OIIO_SIMD_SSE   /* SSE2/3 */
9085     return vint8 (ifloor(a.lo()), ifloor(a.hi()));
9086 #else
9087     SIMD_RETURN (vint8, (int)floorf(a[i]));
9088 #endif
9089 }
9090 
9091 
9092 OIIO_FORCEINLINE vint8 rint (const vfloat8& a)
9093 {
9094     return vint8 (round(a));
9095 }
9096 
9097 
9098 
9099 OIIO_FORCEINLINE vfloat8 rcp_fast (const vfloat8 &a)
9100 {
9101 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
9102     vfloat8 r = _mm256_rcp14_ps(a);
9103     return r * nmadd(r,a,vfloat8(2.0f));
9104 #elif OIIO_SIMD_AVX
9105     vfloat8 r = _mm256_rcp_ps(a);
9106     return r * nmadd(r,a,vfloat8(2.0f));
9107 #else
9108     return vfloat8(rcp_fast(a.lo()), rcp_fast(a.hi()));
9109 #endif
9110 }
9111 
9112 
9113 OIIO_FORCEINLINE vfloat8 sqrt (const vfloat8 &a)
9114 {
9115 #if OIIO_SIMD_AVX
9116     return _mm256_sqrt_ps (a.simd());
9117 #else
9118     SIMD_RETURN (vfloat8, sqrtf(a[i]));
9119 #endif
9120 }
9121 
9122 
9123 
9124 OIIO_FORCEINLINE vfloat8 rsqrt (const vfloat8 &a)
9125 {
9126 #if OIIO_SIMD_AVX
9127     return _mm256_div_ps (_mm256_set1_ps(1.0f), _mm256_sqrt_ps (a.simd()));
9128 #else
9129     SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
9130 #endif
9131 }
9132 
9133 
9134 
9135 OIIO_FORCEINLINE vfloat8 rsqrt_fast (const vfloat8 &a)
9136 {
9137 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9138     // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
9139     return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
9140 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9141     // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
9142     return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
9143 #elif OIIO_SIMD_AVX
9144     return _mm256_rsqrt_ps (a.simd());
9145 #elif OIIO_SIMD_SSE
9146     return vfloat8 (rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
9147 #else
9148     SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
9149 #endif
9150 }
9151 
9152 
9153 
9154 OIIO_FORCEINLINE vfloat8 min (const vfloat8& a, const vfloat8& b)
9155 {
9156 #if OIIO_SIMD_AVX
9157     return _mm256_min_ps (a, b);
9158 #else
9159     return vfloat8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
9160 #endif
9161 }
9162 
9163 OIIO_FORCEINLINE vfloat8 max (const vfloat8& a, const vfloat8& b)
9164 {
9165 #if OIIO_SIMD_AVX
9166     return _mm256_max_ps (a, b);
9167 #else
9168     return vfloat8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
9169 #endif
9170 }
9171 
9172 
9173 OIIO_FORCEINLINE vfloat8 andnot (const vfloat8& a, const vfloat8& b) {
9174 #if OIIO_SIMD_AVX
9175     return _mm256_andnot_ps (a.simd(), b.simd());
9176 #else
9177     const int *ai = (const int *)&a;
9178     const int *bi = (const int *)&b;
9179     return bitcast_to_float (vint8(~(ai[0]) & bi[0],
9180                                   ~(ai[1]) & bi[1],
9181                                   ~(ai[2]) & bi[2],
9182                                   ~(ai[3]) & bi[3],
9183                                   ~(ai[4]) & bi[4],
9184                                   ~(ai[5]) & bi[5],
9185                                   ~(ai[6]) & bi[6],
9186                                   ~(ai[7]) & bi[7]));
9187 #endif
9188 }
9189 
9190 
9191 OIIO_FORCEINLINE vfloat8 madd (const simd::vfloat8& a, const simd::vfloat8& b,
9192                               const simd::vfloat8& c)
9193 {
9194 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9195     // If we are sure _mm256_fmadd_ps intrinsic is available, use it.
9196     return _mm256_fmadd_ps (a, b, c);
9197 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9198     return vfloat8 (madd(a.lo(), b.lo(), c.lo()),
9199                     madd(a.hi(), b.hi(), c.hi()));
9200 #else
9201     // Fallback: just use regular math and hope for the best.
9202     return a * b + c;
9203 #endif
9204 }
9205 
9206 
9207 OIIO_FORCEINLINE vfloat8 msub (const simd::vfloat8& a, const simd::vfloat8& b,
9208                               const simd::vfloat8& c)
9209 {
9210 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9211     // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
9212     return _mm256_fmsub_ps (a, b, c);
9213 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9214     return vfloat8 (msub(a.lo(), b.lo(), c.lo()),
9215                     msub(a.hi(), b.hi(), c.hi()));
9216 #else
9217     // Fallback: just use regular math and hope for the best.
9218     return a * b - c;
9219 #endif
9220 }
9221 
9222 
9223 
9224 OIIO_FORCEINLINE vfloat8 nmadd (const simd::vfloat8& a, const simd::vfloat8& b,
9225                                const simd::vfloat8& c)
9226 {
9227 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9228     // If we are sure _mm256_fnmadd_ps intrinsic is available, use it.
9229     return _mm256_fnmadd_ps (a, b, c);
9230 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9231     return vfloat8 (nmadd(a.lo(), b.lo(), c.lo()),
9232                     nmadd(a.hi(), b.hi(), c.hi()));
9233 #else
9234     // Fallback: just use regular math and hope for the best.
9235     return c - a * b;
9236 #endif
9237 }
9238 
9239 
9240 
9241 OIIO_FORCEINLINE vfloat8 nmsub (const simd::vfloat8& a, const simd::vfloat8& b,
9242                                const simd::vfloat8& c)
9243 {
9244 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9245     // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
9246     return _mm256_fnmsub_ps (a, b, c);
9247 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9248     return vfloat8 (nmsub(a.lo(), b.lo(), c.lo()),
9249                     nmsub(a.hi(), b.hi(), c.hi()));
9250 #else
9251     // Fallback: just use regular math and hope for the best.
9252     return -(a * b) - c;
9253 #endif
9254 }
9255 
9256 
9257 
9258 
9259 //////////////////////////////////////////////////////////////////////
9260 // vfloat16 implementation
9261 
9262 OIIO_FORCEINLINE float& vfloat16::operator[] (int i) {
9263     OIIO_DASSERT(i<elements);
9264     return m_val[i];
9265 }
9266 
9267 OIIO_FORCEINLINE float vfloat16::operator[] (int i) const {
9268     OIIO_DASSERT(i<elements);
9269     return m_val[i];
9270 }
9271 
9272 
9273 inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val) {
9274     cout << val[0];
9275     for (int i = 1; i < val.elements; ++i)
9276         cout << ' ' << val[i];
9277     return cout;
9278 }
9279 
9280 
9281 OIIO_FORCEINLINE vfloat8 vfloat16::lo () const {
9282 #if OIIO_SIMD_AVX >= 512
9283     return _mm512_castps512_ps256 (simd());
9284 #else
9285     return m_8[0];
9286 #endif
9287 }
9288 
9289 OIIO_FORCEINLINE vfloat8 vfloat16::hi () const {
9290 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512DQ_ENABLED
9291     return _mm512_extractf32x8_ps (simd(), 1);
9292 #else
9293     return m_8[1];
9294 #endif
9295 }
9296 
9297 
9298 OIIO_FORCEINLINE vfloat16::vfloat16 (float v0, float v1, float v2, float v3,
9299                                    float v4, float v5, float v6, float v7,
9300                                    float v8, float v9, float v10, float v11,
9301                                    float v12, float v13, float v14, float v15) {
9302     load (v0, v1, v2, v3, v4, v5, v6, v7,
9303           v8, v9, v10, v11, v12, v13, v14, v15);
9304 }
9305 
9306 OIIO_FORCEINLINE vfloat16::vfloat16 (const vfloat8& lo, const vfloat8 &hi) {
9307 #if OIIO_SIMD_AVX >= 512
9308     __m512 r = _mm512_castps256_ps512 (lo);
9309     m_simd = _mm512_insertf32x8 (r, hi, 1);
9310 #else
9311     m_8[0] = lo;
9312     m_8[1] = hi;
9313 #endif
9314 }
9315 
9316 OIIO_FORCEINLINE vfloat16::vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d) {
9317 #if OIIO_SIMD_AVX >= 512
9318     m_simd = _mm512_broadcast_f32x4(a);
9319     m_simd = _mm512_insertf32x4 (m_simd, b, 1);
9320     m_simd = _mm512_insertf32x4 (m_simd, c, 2);
9321     m_simd = _mm512_insertf32x4 (m_simd, d, 3);
9322 #else
9323     m_8[0] = vfloat8(a,b);
9324     m_8[1] = vfloat8(c,d);
9325 #endif
9326 }
9327 
9328 
9329 OIIO_FORCEINLINE vfloat16::vfloat16 (const vint16& ival) {
9330 #if OIIO_SIMD_AVX >= 512
9331     m_simd = _mm512_cvtepi32_ps (ival);
9332 #else
9333     SIMD_CONSTRUCT (float(ival[i]));
9334 #endif
9335 }
9336 
9337 
9338 OIIO_FORCEINLINE const vfloat16 vfloat16::Zero () {
9339 #if OIIO_SIMD_AVX >= 512
9340     return _mm512_setzero_ps();
9341 #else
9342     return vfloat16(0.0f);
9343 #endif
9344 }
9345 
9346 OIIO_FORCEINLINE const vfloat16 vfloat16::One () {
9347     return vfloat16(1.0f);
9348 }
9349 
9350 OIIO_FORCEINLINE const vfloat16 vfloat16::Iota (float start, float step) {
9351     return vfloat16 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
9352                     start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step,
9353                     start+8.0f*step, start+9.0f*step, start+10.0f*step, start+11.0f*step,
9354                     start+12.0f*step, start+13.0f*step, start+14.0f*step, start+15.0f*step);
9355 }
9356 
9357 /// Set all components to 0.0
9358 OIIO_FORCEINLINE void vfloat16::clear () {
9359 #if OIIO_SIMD_AVX >= 512
9360     m_simd = _mm512_setzero_ps();
9361 #else
9362     load (0.0f);
9363 #endif
9364 }
9365 
9366 
9367 OIIO_FORCEINLINE void vfloat16::load (float a) {
9368 #if OIIO_SIMD_AVX >= 512
9369     m_simd = _mm512_set1_ps (a);
9370 #else
9371     m_8[0].load (a);
9372     m_8[1].load (a);
9373 #endif
9374 }
9375 
9376 
9377 OIIO_FORCEINLINE void vfloat16::load (float v0, float v1, float v2, float v3,
9378                                      float v4, float v5, float v6, float v7,
9379                                      float v8, float v9, float v10, float v11,
9380                                      float v12, float v13, float v14, float v15) {
9381 #if OIIO_SIMD_AVX >= 512
9382     m_simd = _mm512_setr_ps (v0, v1, v2, v3, v4, v5, v6, v7,
9383                              v8, v9, v10, v11, v12, v13, v14, v15);
9384 #else
9385     m_val[ 0] = v0;
9386     m_val[ 1] = v1;
9387     m_val[ 2] = v2;
9388     m_val[ 3] = v3;
9389     m_val[ 4] = v4;
9390     m_val[ 5] = v5;
9391     m_val[ 6] = v6;
9392     m_val[ 7] = v7;
9393     m_val[ 8] = v8;
9394     m_val[ 9] = v9;
9395     m_val[10] = v10;
9396     m_val[11] = v11;
9397     m_val[12] = v12;
9398     m_val[13] = v13;
9399     m_val[14] = v14;
9400     m_val[15] = v15;
9401 #endif
9402 }
9403 
9404 
9405 OIIO_FORCEINLINE void vfloat16::load (const float *values) {
9406 #if OIIO_SIMD_AVX >= 512
9407     m_simd = _mm512_loadu_ps (values);
9408 #else
9409     m_8[0].load (values);
9410     m_8[1].load (values+8);
9411 #endif
9412 }
9413 
9414 
9415 OIIO_FORCEINLINE void vfloat16::load (const float *values, int n)
9416 {
9417     OIIO_DASSERT (n >= 0 && n <= elements);
9418 #if OIIO_SIMD_AVX >= 512
9419     m_simd = _mm512_maskz_loadu_ps (__mmask16(~(0xffff << n)), values);
9420 #else
9421     if (n > 8) {
9422         m_8[0].load (values);
9423         m_8[1].load (values+8, n-8);
9424     } else {
9425         m_8[0].load (values, n);
9426         m_8[1].clear ();
9427     }
9428 #endif
9429 }
9430 
9431 
9432 OIIO_FORCEINLINE void vfloat16::load (const unsigned short *values) {
9433 #if OIIO_SIMD_AVX >= 512
9434     // Rely on the ushort->int conversion, then convert to float
9435     m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9436 #else
9437     m_8[0].load (values);
9438     m_8[1].load (values+8);
9439 #endif
9440 }
9441 
9442 
9443 OIIO_FORCEINLINE void vfloat16::load (const short *values) {
9444 #if OIIO_SIMD_AVX >= 512
9445     // Rely on the short->int conversion, then convert to float
9446     m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9447 #else
9448     m_8[0].load (values);
9449     m_8[1].load (values+8);
9450 #endif
9451 }
9452 
9453 
9454 OIIO_FORCEINLINE void vfloat16::load (const unsigned char *values) {
9455 #if OIIO_SIMD_AVX >= 512
9456     m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9457 #else
9458     m_8[0].load (values);
9459     m_8[1].load (values+8);
9460 #endif
9461 }
9462 
9463 
9464 OIIO_FORCEINLINE void vfloat16::load (const char *values) {
9465 #if OIIO_SIMD_AVX >= 512
9466     m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9467 #else
9468     m_8[0].load (values);
9469     m_8[1].load (values+8);
9470 #endif
9471 }
9472 
9473 
9474 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
9475 OIIO_FORCEINLINE void vfloat16::load (const half *values) {
9476 #if OIIO_SIMD_AVX >= 512
9477     /* Enabled 16 bit float instructions! */
9478     vint8 a ((const int *)values);
9479     m_simd = _mm512_cvtph_ps (a);
9480 #else
9481     m_8[0].load (values);
9482     m_8[1].load (values+8);
9483 #endif
9484 }
9485 #endif /* _HALF_H_ or _IMATH_H_ */
9486 
9487 
9488 
9489 OIIO_FORCEINLINE void vfloat16::store (float *values) const {
9490 #if OIIO_SIMD_AVX >= 512
9491     // Use an unaligned store -- it's just as fast when the memory turns
9492     // out to be aligned, nearly as fast even when unaligned. Not worth
9493     // the headache of using stores that require alignment.
9494     _mm512_storeu_ps (values, m_simd);
9495 #else
9496     m_8[0].store (values);
9497     m_8[1].store (values+8);
9498 #endif
9499 }
9500 
9501 
9502 OIIO_FORCEINLINE void vfloat16::store (float *values, int n) const {
9503     OIIO_DASSERT (n >= 0 && n <= elements);
9504     // FIXME: is this faster with AVX masked stores?
9505 #if 0 && OIIO_SIMD_AVX >= 512
9506     // This SHOULD be fast, but in my benchmarks, it is slower!
9507     // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
9508     // Re-test this periodically with new Intel hardware.
9509     _mm512_mask_storeu_ps (values, __mmask16(~(0xffff << n)), m_simd);
9510 #else
9511     if (n <= 8) {
9512         lo().store (values, n);
9513     } else if (n < 16) {
9514         lo().store (values);
9515         hi().store (values+8, n-8);
9516     } else {
9517         store (values);
9518     }
9519 #endif
9520 }
9521 
9522 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
9523 OIIO_FORCEINLINE void vfloat16::store (half *values) const {
9524 #if OIIO_SIMD_AVX >= 512
9525     __m256i h = _mm512_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9526     _mm256_storeu_si256 ((__m256i *)values, h);
9527 #else
9528     m_8[0].store (values);
9529     m_8[1].store (values+8);
9530 #endif
9531 }
9532 #endif
9533 
9534 
9535 OIIO_FORCEINLINE void vfloat16::load_mask (const vbool16 &mask, const float *values) {
9536 #if OIIO_SIMD_AVX >= 512
9537     m_simd = _mm512_maskz_loadu_ps (mask, (const simd_t *)values);
9538 #else
9539     m_8[0].load_mask (mask.lo(), values);
9540     m_8[1].load_mask (mask.hi(), values+8);
9541 #endif
9542 }
9543 
9544 
9545 OIIO_FORCEINLINE void vfloat16::store_mask (const vbool16 &mask, float *values) const {
9546 #if OIIO_SIMD_AVX >= 512
9547     _mm512_mask_storeu_ps (values, mask.bitmask(), m_simd);
9548 #else
9549     lo().store_mask (mask.lo(), values);
9550     hi().store_mask (mask.hi(), values+8);
9551 #endif
9552 }
9553 
9554 
9555 
9556 template <int scale>
9557 OIIO_FORCEINLINE void
9558 vfloat16::gather (const value_t *baseptr, const vint_t& vindex)
9559 {
9560 #if OIIO_SIMD_AVX >= 512
9561     m_simd = _mm512_i32gather_ps (vindex, baseptr, scale);
9562 #else
9563     m_8[0].gather<scale> (baseptr, vindex.lo());
9564     m_8[1].gather<scale> (baseptr, vindex.hi());
9565 #endif
9566 }
9567 
9568 template<int scale>
9569 OIIO_FORCEINLINE void
9570 vfloat16::gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex)
9571 {
9572 #if OIIO_SIMD_AVX >= 512
9573     m_simd = _mm512_mask_i32gather_ps (m_simd, mask, vindex, baseptr, scale);
9574 #else
9575     m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
9576     m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
9577 #endif
9578 }
9579 
9580 template<int scale>
9581 OIIO_FORCEINLINE void
9582 vfloat16::scatter (value_t *baseptr, const vint_t& vindex) const
9583 {
9584 #if OIIO_SIMD_AVX >= 512
9585     _mm512_i32scatter_ps (baseptr, vindex, m_simd, scale);
9586 #else
9587     lo().scatter<scale> (baseptr, vindex.lo());
9588     hi().scatter<scale> (baseptr, vindex.hi());
9589 #endif
9590 }
9591 
9592 template<int scale>
9593 OIIO_FORCEINLINE void
9594 vfloat16::scatter_mask (const bool_t& mask, value_t *baseptr,
9595                         const vint_t& vindex) const
9596 {
9597 #if OIIO_SIMD_AVX >= 512
9598     _mm512_mask_i32scatter_ps (baseptr, mask, vindex, m_simd, scale);
9599 #else
9600     lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
9601     hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
9602 #endif
9603 }
9604 
9605 
9606 
9607 OIIO_FORCEINLINE vfloat16 operator+ (const vfloat16& a, const vfloat16& b) {
9608 #if OIIO_SIMD_AVX >= 512
9609     return _mm512_add_ps (a.m_simd, b.m_simd);
9610 #else
9611     return vfloat16 (a.lo()+b.lo(), a.hi()+b.hi());
9612 #endif
9613 }
9614 
9615 OIIO_FORCEINLINE const vfloat16 & operator+= (vfloat16& a, const vfloat16& b) {
9616     return a = a + b;
9617 }
9618 
9619 OIIO_FORCEINLINE vfloat16 operator- (const vfloat16& a) {
9620 #if OIIO_SIMD_AVX >= 512
9621     return _mm512_sub_ps (_mm512_setzero_ps(), a.simd());
9622 #else
9623     return vfloat16 (-a.lo(), -a.hi());
9624 #endif
9625 }
9626 
9627 OIIO_FORCEINLINE vfloat16 operator- (const vfloat16& a, const vfloat16& b) {
9628 #if OIIO_SIMD_AVX >= 512
9629     return _mm512_sub_ps (a.m_simd, b.m_simd);
9630 #else
9631     return vfloat16 (a.lo()-b.lo(), a.hi()-b.hi());
9632 #endif
9633 }
9634 
9635 OIIO_FORCEINLINE const vfloat16 & operator-= (vfloat16& a, const vfloat16& b) {
9636     return a = a - b;
9637 }
9638 
9639 
9640 OIIO_FORCEINLINE vfloat16 operator* (const vfloat16& a, float b) {
9641 #if OIIO_SIMD_AVX >= 512
9642     return _mm512_mul_ps (a.m_simd, _mm512_set1_ps(b));
9643 #else
9644     return vfloat16 (a.lo()*b, a.hi()*b);
9645 #endif
9646 }
9647 
9648 OIIO_FORCEINLINE vfloat16 operator* (float a, const vfloat16& b) {
9649     return b * a;
9650 }
9651 
9652 OIIO_FORCEINLINE vfloat16 operator* (const vfloat16& a, const vfloat16& b) {
9653 #if OIIO_SIMD_AVX >= 512
9654     return _mm512_mul_ps (a.m_simd, b.m_simd);
9655 #else
9656     return vfloat16 (a.lo()*b.lo(), a.hi()*b.hi());
9657 #endif
9658 }
9659 
9660 OIIO_FORCEINLINE const vfloat16 & operator*= (vfloat16& a, const vfloat16& b) {
9661     return a = a * b;
9662 }
9663 
9664 OIIO_FORCEINLINE vfloat16 operator/ (const vfloat16& a, const vfloat16& b) {
9665 #if OIIO_SIMD_AVX >= 512
9666     return _mm512_div_ps (a.m_simd, b.m_simd);
9667 #else
9668     return vfloat16 (a.lo()/b.lo(), a.hi()/b.hi());
9669 #endif
9670 }
9671 
9672 OIIO_FORCEINLINE const vfloat16 & operator/= (vfloat16& a, const vfloat16& b) {
9673     return a = a / b;
9674 }
9675 
9676 
9677 OIIO_FORCEINLINE vbool16 operator== (const vfloat16& a, const vfloat16& b) {
9678 #if OIIO_SIMD_AVX >= 512
9679     return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_EQ_OQ);
9680 #else  /* Fall back to 8-wide */
9681     return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
9682 #endif
9683 }
9684 
9685 
9686 OIIO_FORCEINLINE vbool16 operator!= (const vfloat16& a, const vfloat16& b) {
9687 #if OIIO_SIMD_AVX >= 512
9688     return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_NEQ_OQ);
9689 #else  /* Fall back to 8-wide */
9690     return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
9691 #endif
9692 }
9693 
9694 
9695 OIIO_FORCEINLINE vbool16 operator< (const vfloat16& a, const vfloat16& b) {
9696 #if OIIO_SIMD_AVX >= 512
9697     return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LT_OQ);
9698 #else  /* Fall back to 8-wide */
9699     return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
9700 #endif
9701 }
9702 
9703 
9704 OIIO_FORCEINLINE vbool16 operator> (const vfloat16& a, const vfloat16& b) {
9705 #if OIIO_SIMD_AVX >= 512
9706     return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GT_OQ);
9707 #else  /* Fall back to 8-wide */
9708     return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
9709 #endif
9710 }
9711 
9712 
9713 OIIO_FORCEINLINE vbool16 operator>= (const vfloat16& a, const vfloat16& b) {
9714 #if OIIO_SIMD_AVX >= 512
9715     return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GE_OQ);
9716 #else  /* Fall back to 8-wide */
9717     return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
9718 #endif
9719 }
9720 
9721 
9722 OIIO_FORCEINLINE vbool16 operator<= (const vfloat16& a, const vfloat16& b) {
9723 #if OIIO_SIMD_AVX >= 512
9724     return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LE_OQ);
9725 #else  /* Fall back to 8-wide */
9726     return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
9727 #endif
9728 }
9729 
9730 
9731 // Implementation had to be after the definition of vfloat16.
9732 OIIO_FORCEINLINE vint16::vint16 (const vfloat16& f)
9733 {
9734 #if OIIO_SIMD_AVX >= 512
9735     m_simd = _mm512_cvttps_epi32(f);
9736 #else
9737     *this = vint16 (vint8(f.lo()), vint8(f.hi()));
9738 #endif
9739 }
9740 
9741 
9742 
9743 // Shuffle groups of 4
9744 template<int i0, int i1, int i2, int i3>
9745 vfloat16 shuffle4 (const vfloat16& a) {
9746 #if OIIO_SIMD_AVX >= 512
9747     return _mm512_shuffle_f32x4(a,a,_MM_SHUFFLE(i3,i2,i1,i0));
9748 #else
9749     vfloat4 x[4];
9750     a.store ((float *)x);
9751     return vfloat16 (x[i0], x[i1], x[i2], x[i3]);
9752 #endif
9753 }
9754 
9755 template<int i> vfloat16 shuffle4 (const vfloat16& a) {
9756     return shuffle4<i,i,i,i> (a);
9757 }
9758 
9759 template<int i0, int i1, int i2, int i3>
9760 vfloat16 shuffle (const vfloat16& a) {
9761 #if OIIO_SIMD_AVX >= 512
9762     return _mm512_permute_ps(a,_MM_SHUFFLE(i3,i2,i1,i0));
9763 #else
9764     vfloat4 x[4];
9765     a.store ((float *)x);
9766     return vfloat16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
9767                     shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
9768 #endif
9769 }
9770 
9771 template<int i> vfloat16 shuffle (const vfloat16& a) {
9772     return shuffle<i,i,i,i> (a);
9773 }
9774 
9775 
9776 template<int i>
9777 OIIO_FORCEINLINE float extract (const vfloat16& a) {
9778     return a[i];
9779 }
9780 
9781 
9782 template<int i>
9783 OIIO_FORCEINLINE vfloat16 insert (const vfloat16& a, float val) {
9784     vfloat16 tmp = a;
9785     tmp[i] = val;
9786     return tmp;
9787 }
9788 
9789 
9790 OIIO_FORCEINLINE float vfloat16::x () const {
9791 #if OIIO_SIMD_AVX >= 512
9792     return _mm_cvtss_f32(_mm512_castps512_ps128(m_simd));
9793 #else
9794     return m_val[0];
9795 #endif
9796 }
9797 
9798 OIIO_FORCEINLINE float vfloat16::y () const { return m_val[1]; }
9799 OIIO_FORCEINLINE float vfloat16::z () const { return m_val[2]; }
9800 OIIO_FORCEINLINE float vfloat16::w () const { return m_val[3]; }
9801 OIIO_FORCEINLINE void vfloat16::set_x (float val) { m_val[0] = val; }
9802 OIIO_FORCEINLINE void vfloat16::set_y (float val) { m_val[1] = val; }
9803 OIIO_FORCEINLINE void vfloat16::set_z (float val) { m_val[2] = val; }
9804 OIIO_FORCEINLINE void vfloat16::set_w (float val) { m_val[3] = val; }
9805 
9806 
9807 OIIO_FORCEINLINE vint16 bitcast_to_int (const vfloat16& x)
9808 {
9809 #if OIIO_SIMD_AVX >= 512
9810     return _mm512_castps_si512 (x.simd());
9811 #else
9812     return *(vint16 *)&x;
9813 #endif
9814 }
9815 
9816 OIIO_FORCEINLINE vfloat16 bitcast_to_float (const vint16& x)
9817 {
9818 #if OIIO_SIMD_AVX >= 512
9819     return _mm512_castsi512_ps (x.simd());
9820 #else
9821     return *(vfloat16 *)&x;
9822 #endif
9823 }
9824 
9825 
9826 OIIO_FORCEINLINE vfloat16 vreduce_add (const vfloat16& v) {
9827 #if OIIO_SIMD_AVX >= 512
9828     // Nomenclature: ABCD are the vint4's comprising v
9829     // First, add the vint4's and make them all the same
9830     vfloat16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v);  // each adjacent vint4 is summed
9831     vfloat16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
9832     // Now, add within each vint4
9833     vfloat16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w);  // each adjacent int is summed
9834     return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
9835 #else
9836     vfloat8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
9837     return vfloat16 (sum, sum);
9838 #endif
9839 }
9840 
9841 
9842 OIIO_FORCEINLINE float reduce_add (const vfloat16& v) {
9843 #if OIIO_SIMD_AVX >= 512
9844     return vreduce_add(v).x();
9845 #else
9846     return reduce_add(v.lo()) + reduce_add(v.hi());
9847 #endif
9848 }
9849 
9850 
9851 OIIO_FORCEINLINE vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool16& mask)
9852 {
9853 #if OIIO_SIMD_AVX >= 512
9854     return _mm512_mask_blend_ps (mask, a, b);
9855 #else
9856     return vfloat16 (blend (a.lo(), b.lo(), mask.lo()),
9857                     blend (a.hi(), b.hi(), mask.hi()));
9858 #endif
9859 }
9860 
9861 
9862 OIIO_FORCEINLINE vfloat16 blend0 (const vfloat16& a, const vbool16& mask)
9863 {
9864 #if OIIO_SIMD_AVX >= 512
9865     return _mm512_maskz_mov_ps (mask, a);
9866 #else
9867     return vfloat16 (blend0 (a.lo(), mask.lo()),
9868                     blend0 (a.hi(), mask.hi()));
9869 #endif
9870 }
9871 
9872 
9873 OIIO_FORCEINLINE vfloat16 blend0not (const vfloat16& a, const vbool16& mask)
9874 {
9875 #if OIIO_SIMD_AVX >= 512
9876     return _mm512_maskz_mov_ps (!mask, a);
9877 #else
9878     return vfloat16 (blend0not (a.lo(), mask.lo()),
9879                     blend0not (a.hi(), mask.hi()));
9880 #endif
9881 }
9882 
9883 
9884 OIIO_FORCEINLINE vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b)
9885 {
9886     return blend (b, a, mask);
9887 }
9888 
9889 
9890 OIIO_FORCEINLINE vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b) {
9891 #if OIIO_SIMD_SSE
9892     return blend0not (a/b, b == vfloat16::Zero());
9893 #else
9894     SIMD_RETURN (vfloat16, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
9895 #endif
9896 }
9897 
9898 
9899 OIIO_FORCEINLINE vfloat16 abs (const vfloat16& a)
9900 {
9901 #if OIIO_SIMD_AVX >= 512
9902     // Not available?  return _mm512_abs_ps (a.simd());
9903     // Just clear the sign bit for cheap fabsf
9904     return _mm512_castsi512_ps (_mm512_and_epi32 (_mm512_castps_si512(a.simd()),
9905                                                   _mm512_set1_epi32(0x7fffffff)));
9906 #else
9907     return vfloat16(abs(a.lo()), abs(a.hi()));
9908 #endif
9909 }
9910 
9911 
9912 OIIO_FORCEINLINE vfloat16 sign (const vfloat16& a)
9913 {
9914     vfloat16 one(1.0f);
9915     return blend (one, -one, a < vfloat16::Zero());
9916 }
9917 
9918 
9919 OIIO_FORCEINLINE vfloat16 ceil (const vfloat16& a)
9920 {
9921 #if OIIO_SIMD_AVX >= 512
9922     return _mm512_ceil_ps (a);
9923 #else
9924     return vfloat16(ceil(a.lo()), ceil(a.hi()));
9925 #endif
9926 }
9927 
9928 OIIO_FORCEINLINE vfloat16 floor (const vfloat16& a)
9929 {
9930 #if OIIO_SIMD_AVX >= 512
9931     return _mm512_floor_ps (a);
9932 #else
9933     return vfloat16(floor(a.lo()), floor(a.hi()));
9934 #endif
9935 }
9936 
9937 
9938 OIIO_FORCEINLINE vfloat16 round (const vfloat16& a)
9939 {
9940 #if OIIO_SIMD_AVX >= 512
9941     return _mm512_roundscale_ps (a, (1<<4) | 3); // scale=1, round to nearest smaller mag int
9942 #else
9943     return vfloat16(round(a.lo()), round(a.hi()));
9944 #endif
9945 }
9946 
9947 OIIO_FORCEINLINE vint16 ifloor (const vfloat16& a)
9948 {
9949 #if OIIO_SIMD_AVX >= 512
9950     return _mm512_cvt_roundps_epi32 (a, (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC));
9951 #else
9952     return vint16(floor(a));
9953 #endif
9954 }
9955 
9956 
9957 OIIO_FORCEINLINE vint16 rint (const vfloat16& a)
9958 {
9959     return vint16(round(a));
9960 }
9961 
9962 
9963 OIIO_FORCEINLINE vfloat16 rcp_fast (const vfloat16 &a)
9964 {
9965 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9966     return _mm512_rcp28_ps(a);
9967 #elif OIIO_SIMD_AVX >= 512
9968     vfloat16 r = _mm512_rcp14_ps(a);
9969     return r * nmadd (r, a, vfloat16(2.0f));
9970 #else
9971     return vfloat16(rcp_fast(a.lo()), rcp_fast(a.hi()));
9972 #endif
9973 }
9974 
9975 
9976 OIIO_FORCEINLINE vfloat16 sqrt (const vfloat16 &a)
9977 {
9978 #if OIIO_SIMD_AVX >= 512
9979     return _mm512_sqrt_ps (a);
9980 #else
9981     return vfloat16(sqrt(a.lo()), sqrt(a.hi()));
9982 #endif
9983 }
9984 
9985 
9986 OIIO_FORCEINLINE vfloat16 rsqrt (const vfloat16 &a)
9987 {
9988 #if OIIO_SIMD_AVX >= 512
9989     return _mm512_div_ps (_mm512_set1_ps(1.0f), _mm512_sqrt_ps (a));
9990 #else
9991     return vfloat16(rsqrt(a.lo()), rsqrt(a.hi()));
9992 #endif
9993 }
9994 
9995 
9996 OIIO_FORCEINLINE vfloat16 rsqrt_fast (const vfloat16 &a)
9997 {
9998 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9999     return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
10000 #elif OIIO_SIMD_AVX >= 512
10001     return _mm512_rsqrt14_ps (a);
10002 #else
10003     return vfloat16(rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
10004 #endif
10005 }
10006 
10007 
10008 OIIO_FORCEINLINE vfloat16 min (const vfloat16& a, const vfloat16& b)
10009 {
10010 #if OIIO_SIMD_AVX >= 512
10011     return _mm512_min_ps (a, b);
10012 #else
10013     return vfloat16(min(a.lo(),b.lo()), min(a.hi(),b.hi()));
10014 #endif
10015 }
10016 
10017 OIIO_FORCEINLINE vfloat16 max (const vfloat16& a, const vfloat16& b)
10018 {
10019 #if OIIO_SIMD_AVX >= 512
10020     return _mm512_max_ps (a, b);
10021 #else
10022     return vfloat16(max(a.lo(),b.lo()), max(a.hi(),b.hi()));
10023 #endif
10024 }
10025 
10026 
10027 OIIO_FORCEINLINE vfloat16 andnot (const vfloat16& a, const vfloat16& b) {
10028 #if OIIO_SIMD_AVX >= 512 && defined(__AVX512DQ__)
10029     return _mm512_andnot_ps (a, b);
10030 #else
10031     return vfloat16(andnot(a.lo(),b.lo()), andnot(a.hi(),b.hi()));
10032 #endif
10033 }
10034 
10035 
10036 OIIO_FORCEINLINE vfloat16 madd (const simd::vfloat16& a, const simd::vfloat16& b,
10037                                const simd::vfloat16& c)
10038 {
10039 #if OIIO_SIMD_AVX >= 512
10040     return _mm512_fmadd_ps (a, b, c);
10041 #else
10042     return vfloat16 (madd(a.lo(), b.lo(), c.lo()),
10043                     madd(a.hi(), b.hi(), c.hi()));
10044 #endif
10045 }
10046 
10047 
10048 OIIO_FORCEINLINE vfloat16 msub (const simd::vfloat16& a, const simd::vfloat16& b,
10049                                const simd::vfloat16& c)
10050 {
10051 #if OIIO_SIMD_AVX >= 512
10052     return _mm512_fmsub_ps (a, b, c);
10053 #else
10054     return vfloat16 (msub(a.lo(), b.lo(), c.lo()),
10055                     msub(a.hi(), b.hi(), c.hi()));
10056 #endif
10057 }
10058 
10059 
10060 
10061 OIIO_FORCEINLINE vfloat16 nmadd (const simd::vfloat16& a, const simd::vfloat16& b,
10062                                 const simd::vfloat16& c)
10063 {
10064 #if OIIO_SIMD_AVX >= 512
10065     return _mm512_fnmadd_ps (a, b, c);
10066 #else
10067     return vfloat16 (nmadd(a.lo(), b.lo(), c.lo()),
10068                     nmadd(a.hi(), b.hi(), c.hi()));
10069 #endif
10070 }
10071 
10072 
10073 
10074 OIIO_FORCEINLINE vfloat16 nmsub (const simd::vfloat16& a, const simd::vfloat16& b,
10075                                 const simd::vfloat16& c)
10076 {
10077 #if OIIO_SIMD_AVX >= 512
10078     return _mm512_fnmsub_ps (a, b, c);
10079 #else
10080     return vfloat16 (nmsub(a.lo(), b.lo(), c.lo()),
10081                     nmsub(a.hi(), b.hi(), c.hi()));
10082 #endif
10083 }
10084 
10085 
10086 
10087 
10088 } // end namespace simd
10089 
10090 OIIO_NAMESPACE_END
10091 
10092 
10093 #undef SIMD_DO
10094 #undef SIMD_CONSTRUCT
10095 #undef SIMD_CONSTRUCT_PAD
10096 #undef SIMD_RETURN
10097 #undef SIMD_RETURN_REDUCE
10098