1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md
4 
5 // clang-format off
6 
7 #include <sstream>
8 #include <type_traits>
9 
10 #include <OpenImageIO/argparse.h>
11 #include <OpenImageIO/benchmark.h>
12 #include <OpenImageIO/fmath.h>
13 #include <OpenImageIO/imageio.h>
14 #include <OpenImageIO/simd.h>
15 #include <OpenImageIO/strutil.h>
16 #include <OpenImageIO/timer.h>
17 #include <OpenImageIO/typedesc.h>
18 #include <OpenImageIO/unittest.h>
19 #include <OpenImageIO/ustring.h>
20 
21 
22 
23 using namespace OIIO;
24 
25 using namespace OIIO::simd;
26 
27 
28 static int iterations = 1000000;
29 static int ntrials    = 5;
30 static Sysutil::Term term(std::cout);
31 OIIO_SIMD16_ALIGN float dummy_float[16];
32 OIIO_SIMD16_ALIGN float dummy_float2[16];
33 OIIO_SIMD16_ALIGN float dummy_int[16];
34 
35 
36 
37 static void
getargs(int argc,char * argv[])38 getargs(int argc, char* argv[])
39 {
40     ArgParse ap;
41     ap.intro("simd_test -- unit test and benchmarks for OpenImageIO/simd.h\n"
42              OIIO_INTRO_STRING)
43       .usage("simd_test [options]");
44 
45     ap.arg("--iterations %d", &iterations)
46       .help(Strutil::sprintf("Number of iterations (default: %d)", iterations));
47     ap.arg("--trials %d", &ntrials)
48       .help("Number of trials");
49 
50     ap.parse_args(argc, (const char**)argv);
51 }
52 
53 
54 
55 static void
category_heading(string_view name)56 category_heading(string_view name)
57 {
58     std::cout << "\n" << term.ansi("bold,underscore,yellow", name) << "\n\n";
59 }
60 
61 
62 
63 static void
test_heading(string_view name,string_view name2="")64 test_heading(string_view name, string_view name2 = "")
65 {
66     std::cout << term.ansi("bold") << name << ' ' << name2
67               << term.ansi("normal") << "\n";
68 }
69 
70 
71 
72 // What I really want to do is merge benchmark() and benchmark2() into
73 // one template using variadic arguments, like this:
74 //   template <typename FUNC, typename ...ARGS>
75 //   void benchmark (size_t work, string_view funcname, FUNC func, ARGS... args)
76 // But it seems that although this works for Clang, it does not for gcc 4.8
77 // (but does for 4.9). Some day I'll get back to this simplification, but
78 // for now, gcc 4.8 seems like an important barrier.
79 
80 
81 template<typename FUNC, typename T>
82 void
benchmark(string_view funcname,FUNC func,T x,size_t work=0)83 benchmark(string_view funcname, FUNC func, T x, size_t work = 0)
84 {
85     if (!work)
86         work = SimdElements<decltype(func(x))>::size;
87     auto repeat_func = [&](){
88         // Unroll the loop 8 times
89         auto r = func(x); DoNotOptimize (r); clobber_all_memory();
90         r = func(x); DoNotOptimize (r); clobber_all_memory();
91         r = func(x); DoNotOptimize (r); clobber_all_memory();
92         r = func(x); DoNotOptimize (r); clobber_all_memory();
93         r = func(x); DoNotOptimize (r); clobber_all_memory();
94         r = func(x); DoNotOptimize (r); clobber_all_memory();
95         r = func(x); DoNotOptimize (r); clobber_all_memory();
96         r = func(x); DoNotOptimize (r); clobber_all_memory();
97     };
98     float time = time_trial(repeat_func, ntrials, iterations / 8);
99     Strutil::printf("  %s: %7.1f Mvals/sec, (%.1f Mcalls/sec)\n",
100                                  funcname, ((iterations * work) / 1.0e6) / time,
101                                  (iterations / 1.0e6) / time);
102 }
103 
104 
105 template<typename FUNC, typename T, typename U>
106 void
benchmark2(string_view funcname,FUNC func,T x,U y,size_t work=0)107 benchmark2(string_view funcname, FUNC func, T x, U y, size_t work = 0)
108 {
109     if (!work)
110         work = SimdElements<decltype(func(x, y))>::size;
111     auto repeat_func = [&]() {
112         // Unroll the loop 8 times
113         auto r = func(x, y); DoNotOptimize (r); clobber_all_memory();
114         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
115         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
116         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
117         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
118         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
119         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
120         r = func(x, y); DoNotOptimize (r); clobber_all_memory();
121     };
122     float time = time_trial(repeat_func, ntrials, iterations / 8);
123     Strutil::printf("  %s: %7.1f Mvals/sec, (%.1f Mcalls/sec)\n",
124                                  funcname, ((iterations * work) / 1.0e6) / time,
125                                  (iterations / 1.0e6) / time);
126 }
127 
128 
129 
// Build a VEC from four element values; `d` defaults to 0. This primary
// template forwards all four values to VEC's four-argument constructor.
template<typename VEC>
inline VEC
mkvec(typename VEC::value_t a, typename VEC::value_t b, typename VEC::value_t c,
      typename VEC::value_t d = 0)
{
    const VEC result(a, b, c, d);
    return result;
}
137 
138 template<>
139 inline vfloat3
mkvec(float a,float b,float c,float)140 mkvec<vfloat3>(float a, float b, float c, float /*d*/)
141 {
142     return vfloat3(a, b, c);
143 }
144 
145 template<>
146 inline vfloat8
mkvec(float a,float b,float c,float d)147 mkvec<vfloat8>(float a, float b, float c, float d)
148 {
149     return vfloat8(a, b, c, d, a, b, c, d);
150 }
151 
152 template<>
153 inline vfloat16
mkvec(float a,float b,float c,float d)154 mkvec<vfloat16>(float a, float b, float c, float d)
155 {
156     return vfloat16(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
157 }
158 
159 template<>
160 inline vint8
mkvec(int a,int b,int c,int d)161 mkvec<vint8>(int a, int b, int c, int d)
162 {
163     return vint8(a, b, c, d, a, b, c, d);
164 }
165 
166 template<>
167 inline vint16
mkvec(int a,int b,int c,int d)168 mkvec<vint16>(int a, int b, int c, int d)
169 {
170     return vint16(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
171 }
172 
173 template<>
174 inline vbool8
mkvec(bool a,bool b,bool c,bool d)175 mkvec<vbool8>(bool a, bool b, bool c, bool d)
176 {
177     return vbool8(a, b, c, d, a, b, c, d);
178 }
179 
180 template<>
181 inline vbool16
mkvec(bool a,bool b,bool c,bool d)182 mkvec<vbool16>(bool a, bool b, bool c, bool d)
183 {
184     return vbool16(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
185 }
186 
187 
188 
// Build a VEC from eight element values. This primary template forwards
// all eight values to VEC's eight-argument constructor.
template<typename VEC>
inline VEC
mkvec(typename VEC::value_t a, typename VEC::value_t b, typename VEC::value_t c,
      typename VEC::value_t d, typename VEC::value_t e, typename VEC::value_t f,
      typename VEC::value_t g, typename VEC::value_t h)
{
    const VEC result(a, b, c, d, e, f, g, h);
    return result;
}
197 
198 
199 template<>
200 inline vbool4
mkvec(bool a,bool b,bool c,bool d,bool,bool,bool,bool)201 mkvec<vbool4>(bool a, bool b, bool c, bool d, bool, bool, bool, bool)
202 {
203     return vbool4(a, b, c, d);
204 }
205 
206 template<>
207 inline vint4
mkvec(int a,int b,int c,int d,int,int,int,int)208 mkvec<vint4>(int a, int b, int c, int d, int, int, int, int)
209 {
210     return vint4(a, b, c, d);
211 }
212 
213 template<>
214 inline vint16
mkvec(int a,int b,int c,int d,int e,int f,int g,int h)215 mkvec<vint16>(int a, int b, int c, int d, int e, int f, int g, int h)
216 {
217     return vint16(a, b, c, d, e, f, g, h, h + 1, h + 2, h + 3, h + 4, h + 5,
218                   h + 6, h + 7, h + 8);
219 }
220 
221 template<>
222 inline vfloat4
mkvec(float a,float b,float c,float d,float,float,float,float)223 mkvec<vfloat4>(float a, float b, float c, float d, float, float, float, float)
224 {
225     return vfloat4(a, b, c, d);
226 }
227 
228 template<>
229 inline vfloat3
mkvec(float a,float b,float c,float,float,float,float,float)230 mkvec<vfloat3>(float a, float b, float c, float, float, float, float, float)
231 {
232     return vfloat3(a, b, c);
233 }
234 
235 template<>
236 inline vfloat16
mkvec(float a,float b,float c,float d,float e,float f,float g,float h)237 mkvec<vfloat16>(float a, float b, float c, float d, float e, float f, float g,
238                 float h)
239 {
240     return vfloat16(a, b, c, d, e, f, g, h, h + 1, h + 2, h + 3, h + 4, h + 5,
241                     h + 6, h + 7, h + 8);
242 }
243 
244 
245 
246 template<typename VEC>
247 inline int
loadstore_vec(int)248 loadstore_vec(int /*dummy*/)
249 {
250     typedef typename VEC::value_t ELEM;
251     ELEM B[VEC::elements];
252     VEC v;
253     v.load((ELEM*)dummy_float);
254     DoNotOptimize(v);
255     clobber_all_memory();
256     v.store((ELEM*)B);
257     DoNotOptimize(B[0]);
258     return 0;
259 }
260 
261 template<typename VEC>
262 inline VEC
load_vec(int)263 load_vec(int /*dummy*/)
264 {
265     typedef typename VEC::value_t ELEM;
266     VEC v;
267     v.load((ELEM*)dummy_float);
268     return v;
269 }
270 
271 template<typename VEC>
272 inline int
store_vec(const VEC & v)273 store_vec(const VEC& v)
274 {
275     typedef typename VEC::value_t ELEM;
276     v.store((ELEM*)dummy_float);
277     return 0;
278 }
279 
280 template<typename VEC>
281 inline VEC
load_scalar(int)282 load_scalar(int /*dummy*/)
283 {
284     typedef typename VEC::value_t ELEM;
285     VEC v;
286 OIIO_PRAGMA_WARNING_PUSH
287 OIIO_GCC_ONLY_PRAGMA(GCC diagnostic ignored "-Wstrict-aliasing")
288     v.load(*(ELEM*)dummy_float);
289 OIIO_PRAGMA_WARNING_POP
290     return v;
291 }
292 
293 template<typename VEC, int N>
294 inline VEC
load_vec_N(typename VEC::value_t *)295 load_vec_N(typename VEC::value_t* /*B*/)
296 {
297     typedef typename VEC::value_t ELEM;
298     VEC v;
299     v.load((ELEM*)dummy_float, N);
300     return v;
301 }
302 
303 template<typename VEC, int N>
304 inline int
store_vec_N(const VEC & v)305 store_vec_N(const VEC& v)
306 {
307     typedef typename VEC::value_t ELEM;
308     v.store((ELEM*)dummy_float, N);
309     DoNotOptimize(dummy_float[0]);
310     return 0;
311 }
312 
313 
314 
315 inline float
dot_imath(const Imath::V3f & v)316 dot_imath(const Imath::V3f& v)
317 {
318     return v.dot(v);
319 }
320 inline float
dot_imath_simd(const Imath::V3f & v_)321 dot_imath_simd(const Imath::V3f& v_)
322 {
323     vfloat3 v(v_);
324     return simd::dot(v, v);
325 }
326 inline float
dot_simd(const simd::vfloat3 & v)327 dot_simd(const simd::vfloat3& v)
328 {
329     return dot(v, v);
330 }
331 
332 inline Imath::V3f
norm_imath(const Imath::V3f & a)333 norm_imath(const Imath::V3f& a)
334 {
335     return a.normalized();
336 }
337 
338 inline Imath::V3f
norm_imath_simd(const vfloat3 & a)339 norm_imath_simd(const vfloat3& a)
340 {
341     return a.normalized().V3f();
342 }
343 
344 inline Imath::V3f
norm_imath_simd_fast(const vfloat3 & a)345 norm_imath_simd_fast(const vfloat3& a)
346 {
347     return a.normalized_fast().V3f();
348 }
349 
350 inline vfloat3
norm_simd_fast(const vfloat3 & a)351 norm_simd_fast(const vfloat3& a)
352 {
353     return a.normalized_fast();
354 }
355 
356 inline vfloat3
norm_simd(const vfloat3 & a)357 norm_simd(const vfloat3& a)
358 {
359     return a.normalized();
360 }
361 
362 
363 inline Imath::M44f
inverse_imath(const Imath::M44f & M)364 inverse_imath(const Imath::M44f& M)
365 {
366     return M.inverse();
367 }
368 
369 
370 inline matrix44
inverse_simd(const matrix44 & M)371 inverse_simd(const matrix44& M)
372 {
373     return M.inverse();
374 }
375 
376 
377 
378 template<typename VEC>
379 void
test_loadstore()380 test_loadstore()
381 {
382     typedef typename VEC::value_t ELEM;
383     test_heading("load/store ", VEC::type_name());
384     OIIO_SIMD16_ALIGN ELEM oneval[]
385         = { 101, 101, 101, 101, 101, 101, 101, 101,
386             101, 101, 101, 101, 101, 101, 101, 101 };
387     OIIO_CHECK_SIMD_EQUAL(VEC(oneval), VEC(oneval[0]));
388     { VEC a = oneval[0]; OIIO_CHECK_SIMD_EQUAL(VEC(oneval), a); }
389     OIIO_SIMD16_ALIGN VEC C1234 = VEC::Iota(1);
390     OIIO_SIMD16_ALIGN ELEM partial[]
391         = { 101, 102, 103, 104, 105, 106, 107, 108,
392             109, 110, 111, 112, 113, 114, 115, 116 };
393     OIIO_CHECK_SIMD_EQUAL(VEC(partial), VEC::Iota(101));
394     for (int i = 1; i <= VEC::elements; ++i) {
395         VEC a(ELEM(0));
396         a.load(partial, i);
397         for (int j = 0; j < VEC::elements; ++j)
398             OIIO_CHECK_EQUAL(a[j], j < i ? partial[j] : ELEM(0));
399         std::cout << "  partial load " << i << " : " << a << "\n";
400         ELEM stored[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
401         C1234.store(stored, i);
402         for (int j = 0; j < VEC::elements; ++j)
403             OIIO_CHECK_EQUAL(stored[j], j < i ? ELEM(j + 1) : ELEM(0));
404         std::cout << "  partial store " << i << " :";
405         for (int c = 0; c < VEC::elements; ++c)
406             std::cout << ' ' << stored[c];
407         std::cout << std::endl;
408     }
409 
410     benchmark("load scalar", load_scalar<VEC>, 0, VEC::elements);
411     benchmark("load vec", load_vec<VEC>, 0, VEC::elements);
412     benchmark("store vec", store_vec<VEC>, 0, VEC::elements);
413     OIIO_SIMD16_ALIGN ELEM tmp[VEC::elements];
414     if (VEC::elements == 16) {
415         benchmark("load 16 comps", load_vec_N<VEC, 16>, tmp, 16);
416         benchmark("load 13 comps", load_vec_N<VEC, 13>, tmp, 13);
417         benchmark("load 9 comps", load_vec_N<VEC, 9>, tmp, 9);
418     }
419     if (VEC::elements > 4) {
420         benchmark("load 8 comps", load_vec_N<VEC, 8>, tmp, 8);
421         benchmark("load 7 comps", load_vec_N<VEC, 7>, tmp, 7);
422         benchmark("load 6 comps", load_vec_N<VEC, 6>, tmp, 6);
423         benchmark("load 5 comps", load_vec_N<VEC, 5>, tmp, 5);
424     }
425     if (VEC::elements >= 4) {
426         benchmark("load 4 comps", load_vec_N<VEC, 4>, tmp, 4);
427     }
428     benchmark("load 3 comps", load_vec_N<VEC, 3>, tmp, 3);
429     benchmark("load 2 comps", load_vec_N<VEC, 2>, tmp, 2);
430     benchmark("load 1 comps", load_vec_N<VEC, 1>, tmp, 1);
431 
432     if (VEC::elements == 16) {
433         benchmark("store 16 comps", store_vec_N<VEC, 16>, C1234, 16);
434         benchmark("store 13 comps", store_vec_N<VEC, 13>, C1234, 13);
435         benchmark("store 9 comps", store_vec_N<VEC, 9>, C1234, 9);
436     }
437     if (VEC::elements > 4) {
438         benchmark("store 8 comps", store_vec_N<VEC, 8>, C1234, 8);
439         benchmark("store 7 comps", store_vec_N<VEC, 7>, C1234, 7);
440         benchmark("store 6 comps", store_vec_N<VEC, 6>, C1234, 6);
441         benchmark("store 5 comps", store_vec_N<VEC, 5>, C1234, 5);
442     }
443     if (VEC::elements >= 4) {
444         benchmark("store 4 comps", store_vec_N<VEC, 4>, C1234, 4);
445     }
446     benchmark("store 3 comps", store_vec_N<VEC, 3>, C1234, 3);
447     benchmark("store 2 comps", store_vec_N<VEC, 2>, C1234, 2);
448     benchmark("store 1 comps", store_vec_N<VEC, 1>, C1234, 1);
449 }
450 
451 
452 
453 template<typename VEC>
454 void
test_conversion_loadstore_float()455 test_conversion_loadstore_float()
456 {
457     typedef typename VEC::value_t ELEM;
458     test_heading("load/store with conversion", VEC::type_name());
459     VEC C1234      = VEC::Iota(1);
460     ELEM partial[] = { 101, 102, 103, 104, 105, 106, 107, 108,
461                        109, 110, 111, 112, 113, 114, 115, 116 };
462     OIIO_CHECK_SIMD_EQUAL(VEC(partial), VEC::Iota(101));
463 
464     // Check load from integers
465     unsigned short us1234[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
466     short s1234[]           = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
467     unsigned char uc1234[]  = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
468     char c1234[]            = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
469     half h1234[]            = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
470     OIIO_CHECK_SIMD_EQUAL (VEC(us1234), C1234);
471     OIIO_CHECK_SIMD_EQUAL (VEC( s1234), C1234);
472     OIIO_CHECK_SIMD_EQUAL (VEC(uc1234), C1234);
473     OIIO_CHECK_SIMD_EQUAL (VEC( c1234), C1234);
474 
475     benchmark ("load from unsigned short[]", [](const unsigned short *d){ return VEC(d); }, us1234);
476     benchmark ("load from short[]", [](const short *d){ return VEC(d); }, s1234);
477     benchmark ("load from unsigned char[]", [](const unsigned char *d){ return VEC(d); }, uc1234);
478     benchmark ("load from char[]", [](const char *d){ return VEC(d); }, c1234);
479     benchmark ("load from half[]", [](const half *d){ return VEC(d); }, h1234);
480 
481     benchmark ("store to half[]", [=](half *d){ C1234.store(d); return 0; }, h1234, VEC::elements);
482 }
483 
484 
485 
// For an int SIMD type VEC: check that constructing from arrays of int and
// narrower integer types yields 1,2,3,..., then benchmark converting loads
// and the converting stores to unsigned short / unsigned char.
template<typename VEC>
void
test_conversion_loadstore_int()
{
    using ELEM = typename VEC::value_t;
    test_heading("load/store with conversion", VEC::type_name());

    VEC C1234 = VEC::Iota(1);
    ELEM partial[] = {
        101, 102, 103, 104, 105, 106, 107, 108,
        109, 110, 111, 112, 113, 114, 115, 116
    };
    OIIO_CHECK_SIMD_EQUAL (VEC(partial), VEC::Iota(101));

    // Source arrays holding 1..16 in each type we convert from.
    int i1234[]             = { 1, 2,  3,  4,  5,  6,  7,  8,
                                9, 10, 11, 12, 13, 14, 15, 16 };
    unsigned short us1234[] = { 1, 2,  3,  4,  5,  6,  7,  8,
                                9, 10, 11, 12, 13, 14, 15, 16 };
    short s1234[]           = { 1, 2,  3,  4,  5,  6,  7,  8,
                                9, 10, 11, 12, 13, 14, 15, 16 };
    unsigned char uc1234[]  = { 1, 2,  3,  4,  5,  6,  7,  8,
                                9, 10, 11, 12, 13, 14, 15, 16 };
    char c1234[]            = { 1, 2,  3,  4,  5,  6,  7,  8,
                                9, 10, 11, 12, 13, 14, 15, 16 };

    // Check load from integers
    OIIO_CHECK_SIMD_EQUAL (VEC( i1234), C1234);
    OIIO_CHECK_SIMD_EQUAL (VEC(us1234), C1234);
    OIIO_CHECK_SIMD_EQUAL (VEC( s1234), C1234);
    OIIO_CHECK_SIMD_EQUAL (VEC(uc1234), C1234);
    OIIO_CHECK_SIMD_EQUAL (VEC( c1234), C1234);

    benchmark("load from int[]",
              [](const int* d) { return VEC(d); }, i1234);
    benchmark("load from unsigned short[]",
              [](const unsigned short* d) { return VEC(d); }, us1234);
    benchmark("load from short[]",
              [](const short* d) { return VEC(d); }, s1234);
    benchmark("load from unsigned char[]",
              [](const unsigned char* d) { return VEC(d); }, uc1234);
    benchmark("load from char[]",
              [](const char* d) { return VEC(d); }, c1234);

    // The store lambdas return int, so the work count is passed explicitly.
    benchmark("store to unsigned short[]",
              [=](unsigned short* d) { C1234.store(d); return 0; },
              us1234, VEC::elements);
    benchmark("store to unsigned char[]",
              [=](unsigned char* d) { C1234.store(d); return 0; },
              uc1234, VEC::elements);
}
517 
518 
519 
520 template<typename VEC>
test_vint_to_uint16s()521 void test_vint_to_uint16s ()
522 {
523     test_heading (Strutil::sprintf("test converting %s to uint16", VEC::type_name()));
524     VEC ival = VEC::Iota (0xffff0000);
525     unsigned short buf[VEC::elements];
526     ival.store (buf);
527     for (int i = 0; i < VEC::elements; ++i)
528         OIIO_CHECK_EQUAL (int(buf[i]), i);
529 
530     benchmark2 ("load from uint16", [](VEC& a, unsigned short *s){ a.load(s); return 1; }, ival, buf, VEC::elements);
531     benchmark2 ("convert to uint16", [](const VEC& a, unsigned short *s){ a.store(s); return 1; }, ival, buf, VEC::elements);
532 }
533 
534 
535 
536 template<typename VEC>
test_vint_to_uint8s()537 void test_vint_to_uint8s ()
538 {
539     test_heading (Strutil::sprintf("test converting %s to uint8", VEC::type_name()));
540     VEC ival = VEC::Iota (0xffffff00);
541     unsigned char buf[VEC::elements];
542     ival.store (buf);
543     for (int i = 0; i < VEC::elements; ++i)
544         OIIO_CHECK_EQUAL (int(buf[i]), i);
545 
546     benchmark2 ("load from uint8", [](VEC& a, unsigned char *s){ a.load(s); return 1; }, ival, buf, VEC::elements);
547     benchmark2 ("convert to uint16", [](const VEC& a, unsigned char *s){ a.store(s); return 1; }, ival, buf, VEC::elements);
548 }
549 
550 
551 
552 template<typename VEC>
test_masked_loadstore()553 void test_masked_loadstore ()
554 {
555     typedef typename VEC::value_t ELEM;
556     typedef typename VEC::vbool_t BOOL;
557     test_heading ("masked loadstore ", VEC::type_name());
558     ELEM iota[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
559     BOOL mask1 = mkvec<BOOL> (true, false, true, false);
560     BOOL mask2 = mkvec<BOOL> (true, true, false,false);
561 
562     VEC v;
563     v = -1;
564     v.load_mask (mask1, iota);
565     ELEM r1[] = { 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0 };
566     OIIO_CHECK_SIMD_EQUAL (v, VEC(r1));
567     ELEM buf[] = { -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2 };
568     v.store_mask (mask2, buf);
569     ELEM r2[] = { 1, 0, -2, -2, 5, 0, -2, -2, 9, 0, -2, -2, 13, 0, -2, -2 };
570     OIIO_CHECK_SIMD_EQUAL (VEC(buf), VEC(r2));
571 
572     benchmark ("masked load with int mask", [](const ELEM *d){ VEC v; v.load_mask (0xffff, d); return v; }, iota);
573     benchmark ("masked load with bool mask", [](const ELEM *d){ VEC v; v.load_mask (BOOL::True(), d); return v; }, iota);
574     benchmark ("masked store with int mask", [&](ELEM *d){ v.store_mask (0xffff, d); return 0; }, r2);
575     benchmark ("masked store with bool mask", [&](ELEM *d){ v.store_mask (BOOL::True(), d); return 0; }, r2);
576 }
577 
578 
579 
// Check gather(), gather_mask(), scatter(), and scatter_mask() for the SIMD
// type VEC using a strided buffer, then benchmark all four.
template<typename VEC>
void
test_gatherscatter()
{
    using ELEM = typename VEC::value_t;
    using BOOL = typename VEC::vbool_t;
    test_heading("scatter & gather ", VEC::type_name());

    const int spacing = 3;
    const int bufsize = VEC::elements * 3 + 1;

    // gather_source will contain: -1 0 -1  -1 1 -1  -1 2 -1  -1 3 -1  ...
    std::vector<ELEM> gather_source(bufsize);
    for (int i = 0; i < bufsize; ++i) {
        const bool is_payload = (i % spacing) == 1;
        gather_source[i]      = is_payload ? i / 3 : -1;
    }

    // Indices 1, 4, 7, ... pick out exactly the payload slots.
    auto indices = VEC::vint_t::Iota(1, 3);
    VEC g, gm;
    g.gather(gather_source.data(), indices);
    OIIO_CHECK_SIMD_EQUAL(g, VEC::Iota());

    // Masked gather into a zeroed vector: only even lanes are fetched.
    BOOL mask = BOOL::from_bitmask(0x55555555);  // every other one
    ELEM every_other_iota[] = { 0, 0, 2, 0, 4, 0, 6, 0,
                                8, 0, 10, 0, 12, 0, 14, 0 };
    gm = 0;
    gm.gather_mask(mask, gather_source.data(), indices);
    OIIO_CHECK_SIMD_EQUAL (gm, VEC(every_other_iota));

    // Scattering what was gathered must reproduce the source buffer.
    std::vector<ELEM> scatter_out(bufsize, ELEM(-1));
    g.scatter(scatter_out.data(), indices);
    OIIO_CHECK_ASSERT (scatter_out == gather_source);

    // Masked scatter: only the payload slots of even lanes get written.
    std::fill(scatter_out.begin(), scatter_out.end(), -1);
    VEC::Iota().scatter_mask(mask, scatter_out.data(), indices);
    const int nscatter = static_cast<int>(scatter_out.size());
    for (int i = 0; i < nscatter; ++i)
        OIIO_CHECK_EQUAL (scatter_out[i], ((i%3) == 1 && (i&1) ? i/3 : -1));

    benchmark("gather",
              [&](const ELEM* d) { VEC v; v.gather (d, indices); return v; },
              gather_source.data());
    benchmark("gather_mask",
              [&](const ELEM* d) { VEC v; v.gather_mask (mask, d, indices); return v; },
              gather_source.data());
    benchmark("scatter",
              [&](ELEM* d) { g.scatter (d, indices); return g; },
              scatter_out.data());
    benchmark("scatter_mask",
              [&](ELEM* d) { g.scatter_mask (mask, d, indices); return g; },
              scatter_out.data());
}
620 
621 
622 
623 template<typename T>
test_extract3()624 void test_extract3 ()
625 {
626     const T vals[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
627     using VEC = typename VecType<T,3>::type;
628     VEC b (vals);
629     for (int i = 0; i < VEC::elements; ++i)
630         OIIO_CHECK_EQUAL (b[i], vals[i]);
631     OIIO_CHECK_EQUAL (extract<0>(b), 0);
632     OIIO_CHECK_EQUAL (extract<1>(b), 1);
633     OIIO_CHECK_EQUAL (extract<2>(b), 2);
634 }
635 
636 template<typename T>
637 void
test_extract4()638 test_extract4()
639 {
640     const T vals[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
641     using VEC      = typename VecType<T, 4>::type;
642     VEC b(vals);
643     for (int i = 0; i < VEC::elements; ++i)
644         OIIO_CHECK_EQUAL(b[i], vals[i]);
645     OIIO_CHECK_EQUAL(extract<0>(b), 0);
646     OIIO_CHECK_EQUAL(extract<1>(b), 1);
647     OIIO_CHECK_EQUAL(extract<2>(b), 2);
648     OIIO_CHECK_EQUAL(extract<3>(b), 3);
649 }
650 
651 template<typename T>
652 void
test_extract8()653 test_extract8()
654 {
655     test_extract4<T>();
656 
657     const T vals[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
658     using VEC      = typename VecType<T, 8>::type;
659     VEC b(vals);
660     for (int i = 0; i < VEC::elements; ++i)
661         OIIO_CHECK_EQUAL(b[i], vals[i]);
662     OIIO_CHECK_EQUAL(extract<4>(b), 4);
663     OIIO_CHECK_EQUAL(extract<5>(b), 5);
664     OIIO_CHECK_EQUAL(extract<6>(b), 6);
665     OIIO_CHECK_EQUAL(extract<7>(b), 7);
666 }
667 
668 template<typename T>
669 void
test_extract16()670 test_extract16()
671 {
672     test_extract8<T>();
673 
674     const T vals[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
675     using VEC      = typename VecType<T, 16>::type;
676     VEC b(vals);
677     for (int i = 0; i < VEC::elements; ++i)
678         OIIO_CHECK_EQUAL(b[i], vals[i]);
679     OIIO_CHECK_EQUAL(extract<8>(b), 8);
680     OIIO_CHECK_EQUAL(extract<9>(b), 9);
681     OIIO_CHECK_EQUAL(extract<10>(b), 10);
682     OIIO_CHECK_EQUAL(extract<11>(b), 11);
683     OIIO_CHECK_EQUAL(extract<12>(b), 12);
684     OIIO_CHECK_EQUAL(extract<13>(b), 13);
685     OIIO_CHECK_EQUAL(extract<14>(b), 14);
686     OIIO_CHECK_EQUAL(extract<15>(b), 15);
687 }
688 
689 
690 
691 template<typename T, int SIZE> void test_extract ();
test_extract()692 template<> void test_extract<float,16> () { test_extract16<float>(); }
test_extract()693 template<> void test_extract<int,16> () { test_extract16<int>(); }
test_extract()694 template<> void test_extract<float,8> () { test_extract8<float>(); }
test_extract()695 template<> void test_extract<int,8> () { test_extract8<int>(); }
test_extract()696 template<> void test_extract<float,4> () { test_extract4<float>(); }
test_extract()697 template<> void test_extract<int,4> () { test_extract4<int>(); }
test_extract()698 template<> void test_extract<float,3> () { test_extract3<float>(); }
699 
700 
701 
702 template<typename VEC>
703 void
test_component_access()704 test_component_access()
705 {
706     typedef typename VEC::value_t ELEM;
707     test_heading("component_access ", VEC::type_name());
708 
709     const ELEM vals[]
710         = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
711     VEC a = VEC::Iota();
712     for (int i = 0; i < VEC::elements; ++i)
713         OIIO_CHECK_EQUAL(a[i], vals[i]);
714 
715     if (VEC::elements <= 4) {
716         OIIO_CHECK_EQUAL(a.x(), 0);
717         OIIO_CHECK_EQUAL(a.y(), 1);
718         OIIO_CHECK_EQUAL(a.z(), 2);
719         if (SimdElements<VEC>::size > 3)
720             OIIO_CHECK_EQUAL(a.w(), 3);
721         VEC t;
722         t = a;
723         t.set_x(42);
724         OIIO_CHECK_SIMD_EQUAL(t, mkvec<VEC>(42, 1, 2, 3, 4, 5, 6, 7));
725         t = a;
726         t.set_y(42);
727         OIIO_CHECK_SIMD_EQUAL(t, mkvec<VEC>(0, 42, 2, 3, 4, 5, 6, 7));
728         t = a;
729         t.set_z(42);
730         OIIO_CHECK_SIMD_EQUAL(t, mkvec<VEC>(0, 1, 42, 3, 4, 5, 6, 7));
731         if (SimdElements<VEC>::size > 3) {
732             t = a;
733             t.set_w(42);
734             OIIO_CHECK_SIMD_EQUAL(t, mkvec<VEC>(0, 1, 2, 42, 4, 5, 6, 7));
735         }
736     }
737 
738     OIIO_CHECK_EQUAL(extract<0>(a), 0);
739     OIIO_CHECK_EQUAL(extract<1>(a), 1);
740     OIIO_CHECK_EQUAL(extract<2>(a), 2);
741     if (SimdElements<VEC>::size > 3)
742         OIIO_CHECK_EQUAL (extract<3>(a), 3);
743     OIIO_CHECK_SIMD_EQUAL (insert<0>(a, ELEM(42)), mkvec<VEC>(42,1,2,3,4,5,6,7));
744     OIIO_CHECK_SIMD_EQUAL (insert<1>(a, ELEM(42)), mkvec<VEC>(0,42,2,3,4,5,6,7));
745     OIIO_CHECK_SIMD_EQUAL (insert<2>(a, ELEM(42)), mkvec<VEC>(0,1,42,3,4,5,6,7));
746     if (SimdElements<VEC>::size > 3)
747         OIIO_CHECK_SIMD_EQUAL (insert<3>(a, ELEM(42)), mkvec<VEC>(0,1,2,42,4,5,6,7));
748 
749     VEC b(vals);
750 #if 1
751     test_extract<ELEM, VEC::elements>();
752 #else
753     for (int i = 0; i < VEC::elements; ++i)
754         OIIO_CHECK_EQUAL(b[i], vals[i]);
755     OIIO_CHECK_EQUAL(extract<0>(b), 0);
756     OIIO_CHECK_EQUAL(extract<1>(b), 1);
757     OIIO_CHECK_EQUAL(extract<2>(b), 2);
758     if (SimdElements<VEC>::size > 3)
759         OIIO_CHECK_EQUAL(extract<3>(b), 3);
760     if (SimdElements<VEC>::size > 4) {
761         OIIO_CHECK_EQUAL(extract<4>(b), 4);
762         OIIO_CHECK_EQUAL(extract<5>(b), 5);
763         OIIO_CHECK_EQUAL(extract<6>(b), 6);
764         OIIO_CHECK_EQUAL(extract<7>(b), 7);
765     }
766     if (SimdElements<VEC>::size > 8) {
767         OIIO_CHECK_EQUAL(extract<8>(b), 8);
768         OIIO_CHECK_EQUAL(extract<9>(b), 9);
769         OIIO_CHECK_EQUAL(extract<10>(b), 10);
770         OIIO_CHECK_EQUAL(extract<11>(b), 11);
771         OIIO_CHECK_EQUAL(extract<12>(b), 12);
772         OIIO_CHECK_EQUAL(extract<13>(b), 13);
773         OIIO_CHECK_EQUAL(extract<14>(b), 14);
774         OIIO_CHECK_EQUAL(extract<15>(b), 15);
775     }
776 #endif
777 
778     benchmark2 ("operator[i]", [&](const VEC& v, int i){ return v[i]; },  b, 2, 1 /*work*/);
779     benchmark2 ("operator[2]", [&](const VEC& v, int /*i*/){ return v[2]; },  b, 2, 1 /*work*/);
780     benchmark2 ("operator[0]", [&](const VEC& v, int /*i*/){ return v[0]; },  b, 0, 1 /*work*/);
781     benchmark2 ("extract<2> ", [&](const VEC& v, int /*i*/){ return extract<2>(v); },  b, 2, 1 /*work*/);
782     benchmark2 ("extract<0> ", [&](const VEC& v, int /*i*/){ return extract<0>(v); },  b, 0, 1 /*work*/);
783     benchmark2 ("insert<2> ", [&](const VEC& v, ELEM i){ return insert<2>(v, i); }, b, ELEM(1), 1 /*work*/);
784 }
785 
786 
787 
788 template<>
789 void
test_component_access()790 test_component_access<vbool4>()
791 {
792     typedef vbool4 VEC;
793     typedef VEC::value_t ELEM;
794     test_heading("component_access ", VEC::type_name());
795 
796     for (int bit = 0; bit < VEC::elements; ++bit) {
797         VEC ctr(bit == 0, bit == 1, bit == 2, bit == 3);
798         VEC a;
799         a.clear();
800         for (int b = 0; b < VEC::elements; ++b)
801             a.setcomp(b, b == bit);
802         OIIO_CHECK_SIMD_EQUAL(ctr, a);
803         for (int b = 0; b < VEC::elements; ++b)
804             OIIO_CHECK_EQUAL(bool(a[b]), b == bit);
805         OIIO_CHECK_EQUAL(extract<0>(a), bit == 0);
806         OIIO_CHECK_EQUAL(extract<1>(a), bit == 1);
807         OIIO_CHECK_EQUAL(extract<2>(a), bit == 2);
808         OIIO_CHECK_EQUAL(extract<3>(a), bit == 3);
809     }
810 
811     VEC a;
812     a.load(0, 0, 0, 0);
813     OIIO_CHECK_SIMD_EQUAL(insert<0>(a, 1), VEC(1, 0, 0, 0));
814     OIIO_CHECK_SIMD_EQUAL(insert<1>(a, 1), VEC(0, 1, 0, 0));
815     OIIO_CHECK_SIMD_EQUAL(insert<2>(a, 1), VEC(0, 0, 1, 0));
816     OIIO_CHECK_SIMD_EQUAL(insert<3>(a, 1), VEC(0, 0, 0, 1));
817     a.load(1, 1, 1, 1);
818     OIIO_CHECK_SIMD_EQUAL(insert<0>(a, 0), VEC(0, 1, 1, 1));
819     OIIO_CHECK_SIMD_EQUAL(insert<1>(a, 0), VEC(1, 0, 1, 1));
820     OIIO_CHECK_SIMD_EQUAL(insert<2>(a, 0), VEC(1, 1, 0, 1));
821     OIIO_CHECK_SIMD_EQUAL(insert<3>(a, 0), VEC(1, 1, 1, 0));
822 }
823 
824 
825 
826 template<>
827 void
test_component_access()828 test_component_access<vbool8>()
829 {
830     typedef vbool8 VEC;
831     typedef VEC::value_t ELEM;
832     test_heading("component_access ", VEC::type_name());
833 
834     for (int bit = 0; bit < VEC::elements; ++bit) {
835         VEC ctr(bit == 0, bit == 1, bit == 2, bit == 3, bit == 4, bit == 5,
836                 bit == 6, bit == 7);
837         VEC a;
838         a.clear();
839         for (int b = 0; b < VEC::elements; ++b)
840             a.setcomp(b, b == bit);
841         OIIO_CHECK_SIMD_EQUAL(ctr, a);
842         for (int b = 0; b < VEC::elements; ++b)
843             OIIO_CHECK_EQUAL(bool(a[b]), b == bit);
844         OIIO_CHECK_EQUAL(extract<0>(a), bit == 0);
845         OIIO_CHECK_EQUAL(extract<1>(a), bit == 1);
846         OIIO_CHECK_EQUAL(extract<2>(a), bit == 2);
847         OIIO_CHECK_EQUAL(extract<3>(a), bit == 3);
848         OIIO_CHECK_EQUAL(extract<4>(a), bit == 4);
849         OIIO_CHECK_EQUAL(extract<5>(a), bit == 5);
850         OIIO_CHECK_EQUAL(extract<6>(a), bit == 6);
851         OIIO_CHECK_EQUAL(extract<7>(a), bit == 7);
852     }
853 
854     VEC a;
855     a.load(0, 0, 0, 0, 0, 0, 0, 0);
856     OIIO_CHECK_SIMD_EQUAL(insert<0>(a, 1), VEC(1, 0, 0, 0, 0, 0, 0, 0));
857     OIIO_CHECK_SIMD_EQUAL(insert<1>(a, 1), VEC(0, 1, 0, 0, 0, 0, 0, 0));
858     OIIO_CHECK_SIMD_EQUAL(insert<2>(a, 1), VEC(0, 0, 1, 0, 0, 0, 0, 0));
859     OIIO_CHECK_SIMD_EQUAL(insert<3>(a, 1), VEC(0, 0, 0, 1, 0, 0, 0, 0));
860     OIIO_CHECK_SIMD_EQUAL(insert<4>(a, 1), VEC(0, 0, 0, 0, 1, 0, 0, 0));
861     OIIO_CHECK_SIMD_EQUAL(insert<5>(a, 1), VEC(0, 0, 0, 0, 0, 1, 0, 0));
862     OIIO_CHECK_SIMD_EQUAL(insert<6>(a, 1), VEC(0, 0, 0, 0, 0, 0, 1, 0));
863     OIIO_CHECK_SIMD_EQUAL(insert<7>(a, 1), VEC(0, 0, 0, 0, 0, 0, 0, 1));
864     a.load(1, 1, 1, 1, 1, 1, 1, 1);
865     OIIO_CHECK_SIMD_EQUAL(insert<0>(a, 0), VEC(0, 1, 1, 1, 1, 1, 1, 1));
866     OIIO_CHECK_SIMD_EQUAL(insert<1>(a, 0), VEC(1, 0, 1, 1, 1, 1, 1, 1));
867     OIIO_CHECK_SIMD_EQUAL(insert<2>(a, 0), VEC(1, 1, 0, 1, 1, 1, 1, 1));
868     OIIO_CHECK_SIMD_EQUAL(insert<3>(a, 0), VEC(1, 1, 1, 0, 1, 1, 1, 1));
869     OIIO_CHECK_SIMD_EQUAL(insert<4>(a, 0), VEC(1, 1, 1, 1, 0, 1, 1, 1));
870     OIIO_CHECK_SIMD_EQUAL(insert<5>(a, 0), VEC(1, 1, 1, 1, 1, 0, 1, 1));
871     OIIO_CHECK_SIMD_EQUAL(insert<6>(a, 0), VEC(1, 1, 1, 1, 1, 1, 0, 1));
872     OIIO_CHECK_SIMD_EQUAL(insert<7>(a, 0), VEC(1, 1, 1, 1, 1, 1, 1, 0));
873 }
874 
875 
876 
877 template<>
878 void
test_component_access()879 test_component_access<vbool16>()
880 {
881     typedef vbool16 VEC;
882     typedef VEC::value_t ELEM;
883     test_heading("component_access ", VEC::type_name());
884 
885     for (int bit = 0; bit < VEC::elements; ++bit) {
886         VEC ctr(bit == 0, bit == 1, bit == 2, bit == 3, bit == 4, bit == 5,
887                 bit == 6, bit == 7, bit == 8, bit == 9, bit == 10, bit == 11,
888                 bit == 12, bit == 13, bit == 14, bit == 15);
889         VEC a;
890         a.clear();
891         for (int b = 0; b < VEC::elements; ++b)
892             a.setcomp(b, b == bit);
893         OIIO_CHECK_SIMD_EQUAL(ctr, a);
894         for (int b = 0; b < VEC::elements; ++b)
895             OIIO_CHECK_EQUAL(bool(a[b]), b == bit);
896         OIIO_CHECK_EQUAL(extract<0>(a), bit == 0);
897         OIIO_CHECK_EQUAL(extract<1>(a), bit == 1);
898         OIIO_CHECK_EQUAL(extract<2>(a), bit == 2);
899         OIIO_CHECK_EQUAL(extract<3>(a), bit == 3);
900         OIIO_CHECK_EQUAL(extract<4>(a), bit == 4);
901         OIIO_CHECK_EQUAL(extract<5>(a), bit == 5);
902         OIIO_CHECK_EQUAL(extract<6>(a), bit == 6);
903         OIIO_CHECK_EQUAL(extract<7>(a), bit == 7);
904         OIIO_CHECK_EQUAL(extract<8>(a), bit == 8);
905         OIIO_CHECK_EQUAL(extract<9>(a), bit == 9);
906         OIIO_CHECK_EQUAL(extract<10>(a), bit == 10);
907         OIIO_CHECK_EQUAL(extract<11>(a), bit == 11);
908         OIIO_CHECK_EQUAL(extract<12>(a), bit == 12);
909         OIIO_CHECK_EQUAL(extract<13>(a), bit == 13);
910         OIIO_CHECK_EQUAL(extract<14>(a), bit == 14);
911         OIIO_CHECK_EQUAL(extract<15>(a), bit == 15);
912     }
913 
914     VEC a;
915     a.load (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
916     OIIO_CHECK_SIMD_EQUAL (insert<0> (a, 1), VEC(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));
917     OIIO_CHECK_SIMD_EQUAL (insert<1> (a, 1), VEC(0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0));
918     OIIO_CHECK_SIMD_EQUAL (insert<2> (a, 1), VEC(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0));
919     OIIO_CHECK_SIMD_EQUAL (insert<3> (a, 1), VEC(0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0));
920     OIIO_CHECK_SIMD_EQUAL (insert<4> (a, 1), VEC(0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0));
921     OIIO_CHECK_SIMD_EQUAL (insert<5> (a, 1), VEC(0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0));
922     OIIO_CHECK_SIMD_EQUAL (insert<6> (a, 1), VEC(0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0));
923     OIIO_CHECK_SIMD_EQUAL (insert<7> (a, 1), VEC(0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0));
924     OIIO_CHECK_SIMD_EQUAL (insert<8> (a, 1), VEC(0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0));
925     OIIO_CHECK_SIMD_EQUAL (insert<9> (a, 1), VEC(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0));
926     OIIO_CHECK_SIMD_EQUAL (insert<10>(a, 1), VEC(0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0));
927     OIIO_CHECK_SIMD_EQUAL (insert<11>(a, 1), VEC(0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0));
928     OIIO_CHECK_SIMD_EQUAL (insert<12>(a, 1), VEC(0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0));
929     OIIO_CHECK_SIMD_EQUAL (insert<13>(a, 1), VEC(0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0));
930     OIIO_CHECK_SIMD_EQUAL (insert<14>(a, 1), VEC(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0));
931     OIIO_CHECK_SIMD_EQUAL (insert<15>(a, 1), VEC(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1));
932     a.load (1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1);
933     OIIO_CHECK_SIMD_EQUAL (insert<0> (a, 0), VEC(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1));
934     OIIO_CHECK_SIMD_EQUAL (insert<1> (a, 0), VEC(1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1));
935     OIIO_CHECK_SIMD_EQUAL (insert<2> (a, 0), VEC(1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1));
936     OIIO_CHECK_SIMD_EQUAL (insert<3> (a, 0), VEC(1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1));
937     OIIO_CHECK_SIMD_EQUAL (insert<4> (a, 0), VEC(1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1));
938     OIIO_CHECK_SIMD_EQUAL (insert<5> (a, 0), VEC(1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1));
939     OIIO_CHECK_SIMD_EQUAL (insert<6> (a, 0), VEC(1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1));
940     OIIO_CHECK_SIMD_EQUAL (insert<7> (a, 0), VEC(1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1));
941     OIIO_CHECK_SIMD_EQUAL (insert<8> (a, 0), VEC(1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1));
942     OIIO_CHECK_SIMD_EQUAL (insert<9> (a, 0), VEC(1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1));
943     OIIO_CHECK_SIMD_EQUAL (insert<10>(a, 0), VEC(1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1));
944     OIIO_CHECK_SIMD_EQUAL (insert<11>(a, 0), VEC(1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1));
945     OIIO_CHECK_SIMD_EQUAL (insert<12>(a, 0), VEC(1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1));
946     OIIO_CHECK_SIMD_EQUAL (insert<13>(a, 0), VEC(1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1));
947     OIIO_CHECK_SIMD_EQUAL (insert<14>(a, 0), VEC(1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1));
948     OIIO_CHECK_SIMD_EQUAL (insert<15>(a, 0), VEC(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0));
949 }
950 
951 
952 
// Named wrappers around the arithmetic operators, so that an operator can be
// passed by name as a callable (e.g. to the benchmark helpers).

// Returns -a.
template<typename T>
inline T
do_neg(const T& a)
{
    return -a;
}

// Returns a + b.
template<typename T>
inline T
do_add(const T& a, const T& b)
{
    return a + b;
}

// Returns a - b.
template<typename T>
inline T
do_sub(const T& a, const T& b)
{
    return a - b;
}

// Returns a * b.  The two operand types may differ; the return type is
// whatever operator* produces for them.
template<typename T, typename U = T>
inline auto
do_mul(const T& a, const U& b) -> decltype(a * b)
{
    return a * b;
}

// Returns a / b.
template<typename T>
inline T
do_div(const T& a, const T& b)
{
    return a / b;
}

// Returns safe_div(a, b), explicitly converted to T.
template<typename T>
inline T
do_safe_div(const T& a, const T& b)
{
    return T(safe_div(a, b));
}
// Adds two Imath::V3f values by way of vfloat3: both operands are converted
// to vfloat3, summed there, and the sum is handed back through V3f().
inline Imath::V3f
add_vec_simd(const Imath::V3f& a, const Imath::V3f& b)
{
    const vfloat3 lhs(a);
    const vfloat3 rhs(b);
    return (lhs + rhs).V3f();
}
// Returns abs(a); the call is unqualified, so which abs() is used depends on
// the type T.
template<typename T>
inline T
do_abs(const T& a)
{
    return abs(a);
}
963 
964 
965 template<typename VEC>
test_arithmetic()966 void test_arithmetic ()
967 {
968     typedef typename VEC::value_t ELEM;
969     test_heading ("arithmetic ", VEC::type_name());
970 
971     VEC a = VEC::Iota (1, 3);
972     VEC b = VEC::Iota (1, 1);
973     VEC add(ELEM(0)), sub(ELEM(0)), mul(ELEM(0)), div(ELEM(0));
974     ELEM bsum(ELEM(0));
975     for (int i = 0; i < VEC::elements; ++i) {
976         add[i] = a[i] + b[i];
977         sub[i] = a[i] - b[i];
978         mul[i] = a[i] * b[i];
979         div[i] = a[i] / b[i];
980         bsum += b[i];
981     }
982     OIIO_CHECK_SIMD_EQUAL (a+b, add);
983     OIIO_CHECK_SIMD_EQUAL (a-b, sub);
984     OIIO_CHECK_SIMD_EQUAL (a*b, mul);
985     OIIO_CHECK_SIMD_EQUAL (a/b, div);
986     OIIO_CHECK_SIMD_EQUAL (a*ELEM(2), a*VEC(ELEM(2)));
987     OIIO_CHECK_SIMD_EQUAL (ELEM(2)*a, a*VEC(ELEM(2)));
988     { VEC r = a; r += b; OIIO_CHECK_SIMD_EQUAL (r, add); }
989     { VEC r = a; r -= b; OIIO_CHECK_SIMD_EQUAL (r, sub); }
990     { VEC r = a; r *= b; OIIO_CHECK_SIMD_EQUAL (r, mul); }
991     { VEC r = a; r /= b; OIIO_CHECK_SIMD_EQUAL (r, div); }
992     { VEC r = a; r *= ELEM(2); OIIO_CHECK_SIMD_EQUAL (r, a*ELEM(2)); }
993     // Test to make sure * works for negative 32 bit ints on all SIMD levels,
994     // because it's a different code path for sse2.
995     VEC negA = mkvec<VEC>(-1, 1, -2, 2);
996     VEC negB = mkvec<VEC>(2, 2, -2, -2);
997     OIIO_CHECK_SIMD_EQUAL(negA * negB, mkvec<VEC>(-2, 2, 4, -4));
998 
999     OIIO_CHECK_EQUAL (reduce_add(b), bsum);
1000     OIIO_CHECK_SIMD_EQUAL (vreduce_add(b), VEC(bsum));
1001     OIIO_CHECK_EQUAL (reduce_add(VEC(1.0f)), SimdElements<VEC>::size);
1002 
1003     benchmark2 ("operator+", do_add<VEC>, a, b);
1004     benchmark2 ("operator-", do_sub<VEC>, a, b);
1005     benchmark  ("operator- (neg)", do_neg<VEC>, a);
1006     benchmark2 ("operator*", do_mul<VEC>, a, b);
1007     benchmark2 ("operator* (scalar)", do_mul<VEC,ELEM>, a, ELEM(2));
1008     benchmark2 ("operator/", do_div<VEC>, a, b);
1009     benchmark  ("abs", do_abs<VEC>, a);
1010     benchmark  ("reduce_add", [](const VEC& a){ return vreduce_add(a); }, a);
1011     if (is_same<VEC,vfloat3>::value) {  // For vfloat3, compare to Imath
1012         Imath::V3f a(2.51f,1.0f,1.0f), b(3.1f,1.0f,1.0f);
1013         benchmark2 ("add Imath::V3f", do_add<Imath::V3f>, a, b, 3 /*work*/);
1014         benchmark2 ("add Imath::V3f with simd", add_vec_simd, a, b, 3 /*work*/);
1015         benchmark2 ("sub Imath::V3f", do_sub<Imath::V3f>, a, b, 3 /*work*/);
1016         benchmark2 ("mul Imath::V3f", do_mul<Imath::V3f>, a, b, 3 /*work*/);
1017         benchmark2 ("div Imath::V3f", do_div<Imath::V3f>, a, b, 3 /*work*/);
1018     }
1019     benchmark2 ("reference: add scalar", do_add<ELEM>, a[2], b[1]);
1020     benchmark2 ("reference: mul scalar", do_mul<ELEM>, a[2], b[1]);
1021     benchmark2 ("reference: div scalar", do_div<ELEM>, a[2], b[1]);
1022 }
1023 
1024 
1025 
// Checks madd, msub, nmadd and nmsub against the same expressions written
// with separate multiply and add/subtract, then benchmarks each fused call
// next to its unfused counterpart.
template<typename VEC>
void
test_fused()
{
    test_heading("fused ", VEC::type_name());

    VEC a = VEC::Iota(10);
    VEC b = VEC::Iota(1);
    VEC c = VEC::Iota(0.5f);
    OIIO_CHECK_SIMD_EQUAL(madd (a, b, c), a*b+c);
    OIIO_CHECK_SIMD_EQUAL(msub (a, b, c), a*b-c);
    OIIO_CHECK_SIMD_EQUAL(nmadd (a, b, c), -(a*b)+c);
    OIIO_CHECK_SIMD_EQUAL(nmsub (a, b, c), -(a*b)-c);

    // The addend 'c' is captured by reference; the two multiplicands are
    // supplied by benchmark2.
    benchmark2("madd old *+",
               [&](const VEC& x, const VEC& y) { return x * y + c; }, a, b);
    benchmark2("madd fused",
               [&](const VEC& x, const VEC& y) { return madd(x, y, c); }, a,
               b);
    benchmark2("msub old *-",
               [&](const VEC& x, const VEC& y) { return x * y - c; }, a, b);
    benchmark2("msub fused",
               [&](const VEC& x, const VEC& y) { return msub(x, y, c); }, a,
               b);
    benchmark2("nmadd old (-*)+",
               [&](const VEC& x, const VEC& y) { return c - (x * y); }, a, b);
    benchmark2("nmadd fused",
               [&](const VEC& x, const VEC& y) { return nmadd(x, y, c); }, a,
               b);
    benchmark2("nmsub old -(*+)",
               [&](const VEC& x, const VEC& y) { return -(x * y) - c; }, a, b);
    benchmark2("nmsub fused",
               [&](const VEC& x, const VEC& y) { return nmsub(x, y, c); }, a,
               b);
}
1048 
1049 
1050 
// Named wrappers around the bitwise operators, so that an operator can be
// passed by name as a callable.

// Returns a & b.
template<typename T>
T
do_and(const T& a, const T& b)
{
    return a & b;
}

// Returns a | b.
template<typename T>
T
do_or(const T& a, const T& b)
{
    return a | b;
}

// Returns a ^ b.
template<typename T>
T
do_xor(const T& a, const T& b)
{
    return a ^ b;
}

// Returns ~a.
template<typename T>
T
do_compl(const T& a)
{
    return ~a;
}

// Returns andnot(a, b).
template<typename T>
T
do_andnot(const T& a, const T& b)
{
    return andnot(a, b);
}
1056 
1057 
1058 
1059 template<typename VEC>
1060 void
test_bitwise_int()1061 test_bitwise_int()
1062 {
1063     test_heading("bitwise ", VEC::type_name());
1064 
1065     VEC a(0x12341234);
1066     VEC b(0x11111111);
1067     OIIO_CHECK_SIMD_EQUAL(a & b, VEC(0x10101010));
1068     OIIO_CHECK_SIMD_EQUAL(a | b, VEC(0x13351335));
1069     OIIO_CHECK_SIMD_EQUAL(a ^ b, VEC(0x03250325));
1070     OIIO_CHECK_SIMD_EQUAL(~(a), VEC(0xedcbedcb));
1071     OIIO_CHECK_SIMD_EQUAL(andnot(b, a), (~(b)) & a);
1072     OIIO_CHECK_SIMD_EQUAL(andnot(b, a), VEC(0x02240224));
1073 
1074     VEC atest(15);
1075     atest[1] = 7;
1076     OIIO_CHECK_EQUAL(reduce_and(atest), 7);
1077 
1078     VEC otest(0);
1079     otest[1] = 3;
1080     otest[2] = 4;
1081     OIIO_CHECK_EQUAL(reduce_or(otest), 7);
1082 
1083     benchmark2("operator&", do_and<VEC>, a, b);
1084     benchmark2("operator|", do_or<VEC>, a, b);
1085     benchmark2("operator^", do_xor<VEC>, a, b);
1086     benchmark("operator!", do_compl<VEC>, a);
1087     benchmark2("andnot", do_andnot<VEC>, a, b);
1088     benchmark("reduce_and", [](const VEC& a) { return reduce_and(a); }, a);
1089     benchmark("reduce_or ", [](const VEC& a) { return reduce_or(a); }, a);
1090 }
1091 
1092 
1093 
1094 template<typename VEC>
test_bitwise_bool()1095 void test_bitwise_bool ()
1096 {
1097     test_heading ("bitwise ", VEC::type_name());
1098 
1099     bool A[]   = { true,  true,  false, false, false, false, true,  true,
1100                    true,  true,  false, false, false, false, true,  true  };
1101     bool B[]   = { true,  false, true,  false, true,  false, true,  false,
1102                    true,  false, true,  false, true,  false, true,  false };
1103     bool AND[] = { true,  false, false, false, false, false, true,  false,
1104                    true,  false, false, false, false, false, true,  false };
1105     bool OR[]  = { true,  true,  true,  false, true,  false, true,  true,
1106                    true,  true,  true,  false, true,  false, true,  true  };
1107     bool XOR[] = { false, true,  true,  false, true,  false, false, true,
1108                    false, true,  true,  false, true,  false, false, true  };
1109     bool NOT[] = { false, false, true,  true,  true,  true,  false, false,
1110                    false, false, true,  true,  true,  true,  false, false  };
1111     VEC a(A), b(B), rand(AND), ror(OR), rxor(XOR), rnot(NOT);
1112     OIIO_CHECK_SIMD_EQUAL (a & b, rand);
1113     OIIO_CHECK_SIMD_EQUAL (a | b, ror);
1114     OIIO_CHECK_SIMD_EQUAL (a ^ b, rxor);
1115     OIIO_CHECK_SIMD_EQUAL (~a, rnot);
1116 
1117     VEC onebit(false); onebit.setcomp(3,true);
1118     OIIO_CHECK_EQUAL (reduce_or(VEC::False()), false);
1119     OIIO_CHECK_EQUAL (reduce_or(onebit), true);
1120     OIIO_CHECK_EQUAL (reduce_and(VEC::True()), true);
1121     OIIO_CHECK_EQUAL (reduce_and(onebit), false);
1122     OIIO_CHECK_EQUAL (all(VEC::True()), true);
1123     OIIO_CHECK_EQUAL (any(VEC::True()), true);
1124     OIIO_CHECK_EQUAL (none(VEC::True()), false);
1125     OIIO_CHECK_EQUAL (all(VEC::False()), false);
1126     OIIO_CHECK_EQUAL (any(VEC::False()), false);
1127     OIIO_CHECK_EQUAL (none(VEC::False()), true);
1128 
1129     benchmark2 ("operator&", do_and<VEC>, a, b);
1130     benchmark2 ("operator|", do_or<VEC>, a, b);
1131     benchmark2 ("operator^", do_xor<VEC>, a, b);
1132     benchmark  ("operator!", do_compl<VEC>, a);
1133     benchmark  ("reduce_and", [](const VEC& a){ return reduce_and(a); }, a);
1134     benchmark  ("reduce_or ", [](const VEC& a){ return reduce_or(a); }, a);
1135 }
1136 
1137 
1138 
// Named wrappers around the comparison operators, so that an operator can be
// passed by name as a callable.  T is the operand type; B is the result type
// and must be given explicitly since it cannot be deduced.

// Returns a < b.
template<class T, class B>
B
do_lt(const T& a, const T& b)
{
    return a < b;
}

// Returns a > b.
template<class T, class B>
B
do_gt(const T& a, const T& b)
{
    return a > b;
}

// Returns a <= b.
template<class T, class B>
B
do_le(const T& a, const T& b)
{
    return a <= b;
}

// Returns a >= b.
template<class T, class B>
B
do_ge(const T& a, const T& b)
{
    return a >= b;
}

// Returns a == b.
template<class T, class B>
B
do_eq(const T& a, const T& b)
{
    return a == b;
}

// Returns a != b.
template<class T, class B>
B
do_ne(const T& a, const T& b)
{
    return a != b;
}
1145 
1146 
1147 
1148 template<typename VEC>
1149 void
test_comparisons()1150 test_comparisons()
1151 {
1152     typedef typename VEC::value_t ELEM;
1153     typedef typename VEC::vbool_t bool_t;
1154     test_heading("comparisons ", VEC::type_name());
1155 
1156     VEC a      = VEC::Iota();
1157     bool lt2[] = { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1158     bool gt2[] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
1159     bool le2[] = { 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1160     bool ge2[] = { 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
1161     bool eq2[] = { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1162     bool ne2[] = { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
1163     OIIO_CHECK_SIMD_EQUAL((a < 2), bool_t(lt2));
1164     OIIO_CHECK_SIMD_EQUAL((a > 2), bool_t(gt2));
1165     OIIO_CHECK_SIMD_EQUAL((a <= 2), bool_t(le2));
1166     OIIO_CHECK_SIMD_EQUAL((a >= 2), bool_t(ge2));
1167     OIIO_CHECK_SIMD_EQUAL((a == 2), bool_t(eq2));
1168     OIIO_CHECK_SIMD_EQUAL((a != 2), bool_t(ne2));
1169     VEC b(ELEM(2));
1170     OIIO_CHECK_SIMD_EQUAL((a < b), bool_t(lt2));
1171     OIIO_CHECK_SIMD_EQUAL((a > b), bool_t(gt2));
1172     OIIO_CHECK_SIMD_EQUAL((a <= b), bool_t(le2));
1173     OIIO_CHECK_SIMD_EQUAL((a >= b), bool_t(ge2));
1174     OIIO_CHECK_SIMD_EQUAL((a == b), bool_t(eq2));
1175     OIIO_CHECK_SIMD_EQUAL((a != b), bool_t(ne2));
1176 
1177     benchmark2("operator< ", do_lt<VEC, bool_t>, a, b);
1178     benchmark2("operator> ", do_gt<VEC, bool_t>, a, b);
1179     benchmark2("operator<=", do_le<VEC, bool_t>, a, b);
1180     benchmark2("operator>=", do_ge<VEC, bool_t>, a, b);
1181     benchmark2("operator==", do_eq<VEC, bool_t>, a, b);
1182     benchmark2("operator!=", do_ne<VEC, bool_t>, a, b);
1183 }
1184 
1185 
1186 
1187 template<typename VEC>
1188 void
test_shuffle4()1189 test_shuffle4()
1190 {
1191     typedef typename VEC::value_t ELEM;
1192     test_heading("shuffle ", VEC::type_name());
1193 
1194     VEC a(0, 1, 2, 3);
1195     OIIO_CHECK_SIMD_EQUAL((shuffle<3, 2, 1, 0>(a)), VEC(3, 2, 1, 0));
1196     OIIO_CHECK_SIMD_EQUAL((shuffle<0, 0, 2, 2>(a)), VEC(0, 0, 2, 2));
1197     OIIO_CHECK_SIMD_EQUAL((shuffle<1, 1, 3, 3>(a)), VEC(1, 1, 3, 3));
1198     OIIO_CHECK_SIMD_EQUAL((shuffle<0, 1, 0, 1>(a)), VEC(0, 1, 0, 1));
1199     OIIO_CHECK_SIMD_EQUAL((shuffle<2>(a)), VEC(ELEM(2)));
1200 
1201     benchmark("shuffle<...> ",
1202               [&](const VEC& v) { return shuffle<3, 2, 1, 0>(v); }, a);
1203     benchmark("shuffle<0> ", [&](const VEC& v) { return shuffle<0>(v); }, a);
1204     benchmark("shuffle<1> ", [&](const VEC& v) { return shuffle<1>(v); }, a);
1205     benchmark("shuffle<2> ", [&](const VEC& v) { return shuffle<2>(v); }, a);
1206     benchmark("shuffle<3> ", [&](const VEC& v) { return shuffle<3>(v); }, a);
1207 }
1208 
1209 
1210 
1211 template<typename VEC>
test_shuffle8()1212 void test_shuffle8 ()
1213 {
1214     typedef typename VEC::value_t ELEM;
1215     test_heading ("shuffle ", VEC::type_name());
1216     VEC a (0, 1, 2, 3, 4, 5, 6, 7);
1217     OIIO_CHECK_SIMD_EQUAL ((shuffle<3,2,1,0,3,2,1,0>(a)), VEC(3,2,1,0,3,2,1,0));
1218     OIIO_CHECK_SIMD_EQUAL ((shuffle<0,0,2,2,0,0,2,2>(a)), VEC(0,0,2,2,0,0,2,2));
1219     OIIO_CHECK_SIMD_EQUAL ((shuffle<1,1,3,3,1,1,3,3>(a)), VEC(1,1,3,3,1,1,3,3));
1220     OIIO_CHECK_SIMD_EQUAL ((shuffle<0,1,0,1,0,1,0,1>(a)), VEC(0,1,0,1,0,1,0,1));
1221     OIIO_CHECK_SIMD_EQUAL ((shuffle<2>(a)), VEC(ELEM(2)));
1222 
1223     benchmark ("shuffle<...> ", [&](const VEC& v){ return shuffle<7,6,5,4,3,2,1,0>(v); }, a);
1224     benchmark ("shuffle<0> ", [&](const VEC& v){ return shuffle<0>(v); }, a);
1225     benchmark ("shuffle<1> ", [&](const VEC& v){ return shuffle<1>(v); }, a);
1226     benchmark ("shuffle<2> ", [&](const VEC& v){ return shuffle<2>(v); }, a);
1227     benchmark ("shuffle<3> ", [&](const VEC& v){ return shuffle<3>(v); }, a);
1228     benchmark ("shuffle<4> ", [&](const VEC& v){ return shuffle<4>(v); }, a);
1229     benchmark ("shuffle<5> ", [&](const VEC& v){ return shuffle<5>(v); }, a);
1230     benchmark ("shuffle<6> ", [&](const VEC& v){ return shuffle<6>(v); }, a);
1231     benchmark ("shuffle<7> ", [&](const VEC& v){ return shuffle<7>(v); }, a);
1232 }
1233 
1234 
1235 
1236 template<typename VEC>
test_shuffle16()1237 void test_shuffle16 ()
1238 {
1239     test_heading ("shuffle ", VEC::type_name());
1240     VEC a (0, 1, 2, 3, 4, 5, 6, 7,  8, 9, 10, 11, 12, 13, 14, 15);
1241 
1242     // Shuffle groups of 4
1243     OIIO_CHECK_SIMD_EQUAL ((shuffle4<3,2,1,0>(a)),
1244                            VEC(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3));
1245     OIIO_CHECK_SIMD_EQUAL ((shuffle4<3>(a)),
1246                            VEC(12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15));
1247 
1248     // Shuffle within groups of 4
1249     OIIO_CHECK_SIMD_EQUAL ((shuffle<3,2,1,0>(a)),
1250                            VEC(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12));
1251     OIIO_CHECK_SIMD_EQUAL ((shuffle<3>(a)),
1252                            VEC(3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15));
1253 
1254     benchmark ("shuffle4<> ", [&](const VEC& v){ return shuffle<3,2,1,0>(v); }, a);
1255     benchmark ("shuffle<> ",  [&](const VEC& v){ return shuffle<3,2,1,0>(v); }, a);
1256 }
1257 
1258 
1259 
// Checks the two-vector interleave helpers AxyBxy() and AxBxAyBy() and the
// xyz0()/xyz1() member functions, using a = Iota(0) and b = Iota(10).
template<typename VEC>
void
test_swizzle()
{
    test_heading("swizzle ", VEC::type_name());

    VEC a(VEC::Iota(0));
    VEC b(VEC::Iota(10));

    // Combining lanes from both inputs.
    OIIO_CHECK_SIMD_EQUAL(AxyBxy(a, b), VEC(0, 1, 10, 11));
    OIIO_CHECK_SIMD_EQUAL(AxBxAyBy(a, b), VEC(0, 10, 1, 11));

    // First three lanes kept, last lane replaced by 0 or 1.
    OIIO_CHECK_SIMD_EQUAL(b.xyz0(), VEC(10, 11, 12, 0));
    OIIO_CHECK_SIMD_EQUAL(b.xyz1(), VEC(10, 11, 12, 1));
}
1273 
1274 
1275 
// Checks blend(), blend0() and blend0not() for VEC with an all-false mask, an
// all-true mask, and an alternating true/false mask, then benchmarks them.
template<typename VEC>
void
test_blend()
{
    using ELEM   = typename VEC::value_t;
    using bool_t = typename VEC::vbool_t;
    test_heading("blend ", VEC::type_name());

    VEC a = VEC::Iota(1);
    VEC b = VEC::Iota(10);

    // Masks: all false, all true, and alternating starting with true.
    bool_t f(false);
    bool_t t(true);
    bool tf_values[] = { true, false, true, false, true, false, true, false,
                         true, false, true, false, true, false, true, false };
    bool_t tf(tf_values);

    // blend: a false mask lane selects from 'a', a true lane selects from 'b'.
    OIIO_CHECK_SIMD_EQUAL(blend (a, b, f), a);
    OIIO_CHECK_SIMD_EQUAL(blend (a, b, t), b);

    ELEM r1[] = { 10, 2, 12, 4, 14, 6, 16, 8,  18, 10, 20, 12, 22, 14, 24, 16 };
    OIIO_CHECK_SIMD_EQUAL(blend (a, b, tf), VEC(r1));

    // blend0: lanes with a false mask become zero.
    OIIO_CHECK_SIMD_EQUAL(blend0 (a, f), VEC::Zero());
    OIIO_CHECK_SIMD_EQUAL(blend0 (a, t), a);
    ELEM r2[] = { 1, 0, 3, 0, 5, 0, 7, 0,  9, 0, 11, 0, 13, 0, 15, 0 };
    OIIO_CHECK_SIMD_EQUAL(blend0 (a, tf), VEC(r2));

    // blend0not: lanes with a true mask become zero.
    OIIO_CHECK_SIMD_EQUAL(blend0not (a, f), a);
    OIIO_CHECK_SIMD_EQUAL(blend0not (a, t), VEC::Zero());
    ELEM r3[] = { 0, 2, 0, 4, 0, 6, 0, 8,  0, 10, 0, 12, 0, 14, 0, 16 };
    OIIO_CHECK_SIMD_EQUAL(blend0not (a, tf), VEC(r3));

    benchmark2("blend",
               [&](const VEC& x, const VEC& y) { return blend(x, y, tf); }, a,
               b);
    benchmark2("blend0",
               [](const VEC& x, const bool_t& mask) { return blend0(x, mask); },
               a, tf);
    benchmark2("blend0not",
               [](const VEC& x, const bool_t& mask) {
                   return blend0not(x, mask);
               },
               a, tf);
}
1310 
1311 
1312 
1313 template<typename VEC>
1314 void
test_transpose4()1315 test_transpose4()
1316 {
1317     test_heading("transpose ", VEC::type_name());
1318 
1319     VEC a(0, 1, 2, 3);
1320     VEC b(4, 5, 6, 7);
1321     VEC c(8, 9, 10, 11);
1322     VEC d(12, 13, 14, 15);
1323 
1324     OIIO_CHECK_SIMD_EQUAL(AxBxCxDx(a, b, c, d), VEC(0, 4, 8, 12));
1325 
1326     std::cout << " before transpose:\n";
1327     std::cout << "\t" << a << "\n";
1328     std::cout << "\t" << b << "\n";
1329     std::cout << "\t" << c << "\n";
1330     std::cout << "\t" << d << "\n";
1331     transpose(a, b, c, d);
1332     std::cout << " after transpose:\n";
1333     std::cout << "\t" << a << "\n";
1334     std::cout << "\t" << b << "\n";
1335     std::cout << "\t" << c << "\n";
1336     std::cout << "\t" << d << "\n";
1337     OIIO_CHECK_SIMD_EQUAL(a, VEC(0, 4, 8, 12));
1338     OIIO_CHECK_SIMD_EQUAL(b, VEC(1, 5, 9, 13));
1339     OIIO_CHECK_SIMD_EQUAL(c, VEC(2, 6, 10, 14));
1340     OIIO_CHECK_SIMD_EQUAL(d, VEC(3, 7, 11, 15));
1341 }
1342 
1343 
1344 
// Named wrappers around the shift and rotate operations, so that each can be
// passed by name as a callable.  In every case 'b' is the shift count.

// Returns a << b.
template<typename T>
inline T
do_shl(const T& a, int b)
{
    return a << b;
}

// Returns a >> b.
template<typename T>
inline T
do_shr(const T& a, int b)
{
    return a >> b;
}

// Returns srl(a, b).
template<typename T>
inline T
do_srl(const T& a, int b)
{
    return srl(a, b);
}

// Returns rotl(a, b).
template<typename T>
inline T
do_rotl(const T& a, int b)
{
    return rotl(a, b);
}
1349 
1350 
1351 template<typename VEC>
1352 void
test_shift()1353 test_shift()
1354 {
1355     test_heading("shift ", VEC::type_name());
1356 
1357     // Basics of << and >>
1358     VEC i = VEC::Iota(10, 10);  // 10, 20, 30 ...
1359     OIIO_CHECK_SIMD_EQUAL(i << 2, VEC::Iota(40, 40));
1360     OIIO_CHECK_SIMD_EQUAL(i >> 1, VEC::Iota(5, 5));
1361 
1362     // Tricky cases with high bits, and the difference between >> and srl
1363     int vals[4] = { 1 << 31, -1, 0xffff, 3 };
1364     for (auto hard : vals) {
1365         VEC vhard(hard);
1366         OIIO_CHECK_SIMD_EQUAL (vhard >> 1, VEC(hard>>1));
1367         OIIO_CHECK_SIMD_EQUAL (srl(vhard,1), VEC(unsigned(hard)>>1));
1368         Strutil::printf("  [%x] >>  1 == [%x]\n", vhard, vhard>>1);
1369         Strutil::printf("  [%x] srl 1 == [%x]\n", vhard, srl(vhard,1));
1370         OIIO_CHECK_SIMD_EQUAL (srl(vhard,4), VEC(unsigned(hard)>>4));
1371         Strutil::printf("  [%x] >>  4 == [%x]\n", vhard, vhard>>4);
1372         Strutil::printf("  [%x] srl 4 == [%x]\n", vhard, srl(vhard,4));
1373     }
1374 
1375     // Test <<= and >>=
1376     i = VEC::Iota (10, 10);   i <<= 2;
1377     OIIO_CHECK_SIMD_EQUAL (i, VEC::Iota(40, 40));
1378     i = VEC::Iota (10, 10);   i >>= 1;
1379     OIIO_CHECK_SIMD_EQUAL (i, VEC::Iota(5, 5));
1380 
1381     // Test rotl
1382     {
1383         vint4 v (0x12345678, 0xabcdef01, 0x98765432, 0x31415926);
1384         vint4 r (0x23456781, 0xbcdef01a, 0x87654329, 0x14159263);
1385         OIIO_CHECK_SIMD_EQUAL (rotl(v,4), r);
1386     }
1387 
1388     // Benchmark
1389     benchmark2 ("operator<<", do_shl<VEC>, i, 2);
1390     benchmark2 ("operator>>", do_shr<VEC>, i, 2);
1391     benchmark2 ("srl       ", do_srl<VEC>, i, 2);
1392     benchmark2 ("rotl      ", do_rotl<VEC>, i, 2);
1393 }
1394 
1395 
1396 
1397 void
test_vectorops_vfloat4()1398 test_vectorops_vfloat4()
1399 {
1400     typedef vfloat4 VEC;
1401     typedef VEC::value_t ELEM;
1402     test_heading("vectorops ", VEC::type_name());
1403 
1404     VEC a = mkvec<VEC> (10, 11, 12, 13);
1405     VEC b = mkvec<VEC> (1, 2, 3, 4);
1406     OIIO_CHECK_EQUAL (dot(a,b), ELEM(10+22+36+52));
1407     OIIO_CHECK_EQUAL (dot3(a,b), ELEM(10+22+36));
1408     OIIO_CHECK_SIMD_EQUAL (vdot(a,b), VEC(10+22+36+52));
1409     OIIO_CHECK_SIMD_EQUAL (vdot3(a,b), VEC(10+22+36));
1410     OIIO_CHECK_SIMD_EQUAL (hdiv(vfloat4(1.0f,2.0f,3.0f,2.0f)), vfloat3(0.5f,1.0f,1.5f));
1411 
1412     benchmark2 ("vdot", [](const VEC& a, const VEC& b){ return vdot(a,b); }, a, b);
1413     benchmark2 ("dot", [](const VEC& a, const VEC& b){ return dot(a,b); }, a, b);
1414     benchmark2 ("vdot3", [](const VEC& a, const VEC& b){ return vdot3(a,b); }, a, b);
1415     benchmark2 ("dot3", [](const VEC& a, const VEC& b){ return dot3(a,b); }, a, b);
1416 }
1417 
1418 
1419 
test_vectorops_vfloat3()1420 void test_vectorops_vfloat3 ()
1421 {
1422     typedef vfloat3 VEC;
1423     typedef VEC::value_t ELEM;
1424     test_heading ("vectorops ", VEC::type_name());
1425 
1426     VEC a = mkvec<VEC> (10, 11, 12);
1427     VEC b = mkvec<VEC> (1, 2, 3);
1428     OIIO_CHECK_EQUAL (dot(a,b), ELEM(10+22+36));
1429     OIIO_CHECK_EQUAL (dot3(a,b), ELEM(10+22+36));
1430     OIIO_CHECK_SIMD_EQUAL (vdot(a,b), VEC(10+22+36));
1431     OIIO_CHECK_SIMD_EQUAL (vdot3(a,b), VEC(10+22+36));
1432     OIIO_CHECK_SIMD_EQUAL (vfloat3(1.0f,2.0f,3.0f).normalized(),
1433                            vfloat3(norm_imath(Imath::V3f(1.0f,2.0f,3.0f))));
1434     OIIO_CHECK_SIMD_EQUAL_THRESH (vfloat3(1.0f,2.0f,3.0f).normalized_fast(),
1435                                   vfloat3(norm_imath(Imath::V3f(1.0f,2.0f,3.0f))), 0.0005);
1436 
1437     benchmark2 ("vdot", [](const VEC& a, const VEC& b){ return vdot(a,b); }, a, b);
1438     benchmark2 ("dot", [](const VEC& a, const VEC& b){ return dot(a,b); }, a, b);
1439     benchmark ("dot vfloat3", dot_simd, vfloat3(2.0f,1.0f,0.0f), 1);
1440     // benchmark2 ("dot Imath::V3f", [](Imath::V3f& a, Imath::V3f& b){ return a.dot(b); }, a.V3f(), b.V3f());
1441     benchmark ("dot Imath::V3f", dot_imath, Imath::V3f(2.0f,1.0f,0.0f), 1);
1442     benchmark ("dot Imath::V3f with simd", dot_imath_simd, Imath::V3f(2.0f,1.0f,0.0f), 1);
1443     benchmark ("normalize Imath", norm_imath, Imath::V3f(1.0f,4.0f,9.0f));
1444     benchmark ("normalize Imath with simd", norm_imath_simd, Imath::V3f(1.0f,4.0f,9.0f));
1445     benchmark ("normalize Imath with simd fast", norm_imath_simd_fast, Imath::V3f(1.0f,4.0f,9.0f));
1446     benchmark ("normalize simd", norm_simd, vfloat3(1.0f,4.0f,9.0f));
1447     benchmark ("normalize simd fast", norm_simd_fast, vfloat3(1.0f,4.0f,9.0f));
1448 }
1449 
1450 
1451 
test_constants()1452 void test_constants ()
1453 {
1454     test_heading ("constants");
1455 
1456     OIIO_CHECK_SIMD_EQUAL (vbool4::False(), vbool4(false));
1457     OIIO_CHECK_SIMD_EQUAL (vbool4::True(), vbool4(true));
1458 
1459     OIIO_CHECK_SIMD_EQUAL (vbool8::False(), vbool8(false));
1460     OIIO_CHECK_SIMD_EQUAL (vbool8::True(), vbool8(true));
1461 
1462     OIIO_CHECK_SIMD_EQUAL (vbool16::False(), vbool16(false));
1463     OIIO_CHECK_SIMD_EQUAL (vbool16::True(), vbool16(true));
1464     OIIO_CHECK_SIMD_EQUAL (vbool16::False(), vbool16(false));
1465     OIIO_CHECK_SIMD_EQUAL (vbool16::True(), vbool16(true));
1466 
1467     OIIO_CHECK_SIMD_EQUAL (vint4::Zero(), vint4(0));
1468     OIIO_CHECK_SIMD_EQUAL (vint4::One(), vint4(1));
1469     OIIO_CHECK_SIMD_EQUAL (vint4::NegOne(), vint4(-1));
1470     OIIO_CHECK_SIMD_EQUAL (vint4::Iota(), vint4(0,1,2,3));
1471     OIIO_CHECK_SIMD_EQUAL (vint4::Iota(3), vint4(3,4,5,6));
1472     OIIO_CHECK_SIMD_EQUAL (vint4::Iota(3,2), vint4(3,5,7,9));
1473     OIIO_CHECK_SIMD_EQUAL (vint4::Giota(), vint4(1,2,4,8));
1474 
1475     OIIO_CHECK_SIMD_EQUAL (vint8::Zero(), vint8(0));
1476     OIIO_CHECK_SIMD_EQUAL (vint8::One(), vint8(1));
1477     OIIO_CHECK_SIMD_EQUAL (vint8::NegOne(), vint8(-1));
1478     OIIO_CHECK_SIMD_EQUAL (vint8::Iota(), vint8(0,1,2,3, 4,5,6,7));
1479     OIIO_CHECK_SIMD_EQUAL (vint8::Iota(3), vint8(3,4,5,6, 7,8,9,10));
1480     OIIO_CHECK_SIMD_EQUAL (vint8::Iota(3,2), vint8(3,5,7,9, 11,13,15,17));
1481     OIIO_CHECK_SIMD_EQUAL (vint8::Giota(), vint8(1,2,4,8, 16,32,64,128));
1482 
1483     OIIO_CHECK_SIMD_EQUAL (vint16::Zero(), vint16(0));
1484     OIIO_CHECK_SIMD_EQUAL (vint16::One(), vint16(1));
1485     OIIO_CHECK_SIMD_EQUAL (vint16::NegOne(), vint16(-1));
1486     OIIO_CHECK_SIMD_EQUAL (vint16::Iota(), vint16(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15));
1487     OIIO_CHECK_SIMD_EQUAL (vint16::Iota(3), vint16(3,4,5,6, 7,8,9,10, 11,12,13,14, 15,16,17,18));
1488     OIIO_CHECK_SIMD_EQUAL (vint16::Iota(3,2), vint16(3,5,7,9, 11,13,15,17, 19,21,23,25, 27,29,31,33));
1489     OIIO_CHECK_SIMD_EQUAL (vint16::Giota(), vint16(1,2,4,8, 16,32,64,128, 256,512,1024,2048, 4096,8192,16384,32768));
1490 
1491     OIIO_CHECK_SIMD_EQUAL (vfloat4::Zero(), vfloat4(0.0f));
1492     OIIO_CHECK_SIMD_EQUAL (vfloat4::One(), vfloat4(1.0f));
1493     OIIO_CHECK_SIMD_EQUAL (vfloat4::Iota(), vfloat4(0,1,2,3));
1494     OIIO_CHECK_SIMD_EQUAL (vfloat4::Iota(3.0f), vfloat4(3,4,5,6));
1495     OIIO_CHECK_SIMD_EQUAL (vfloat4::Iota(3.0f,2.0f), vfloat4(3,5,7,9));
1496 
1497     OIIO_CHECK_SIMD_EQUAL (vfloat3::Zero(), vfloat3(0.0f));
1498     OIIO_CHECK_SIMD_EQUAL (vfloat3::One(), vfloat3(1.0f));
1499     OIIO_CHECK_SIMD_EQUAL (vfloat3::Iota(), vfloat3(0,1,2));
1500     OIIO_CHECK_SIMD_EQUAL (vfloat3::Iota(3.0f), vfloat3(3,4,5));
1501     OIIO_CHECK_SIMD_EQUAL (vfloat3::Iota(3.0f,2.0f), vfloat3(3,5,7));
1502 
1503     OIIO_CHECK_SIMD_EQUAL (vfloat8::Zero(), vfloat8(0.0f));
1504     OIIO_CHECK_SIMD_EQUAL (vfloat8::One(), vfloat8(1.0f));
1505     OIIO_CHECK_SIMD_EQUAL (vfloat8::Iota(), vfloat8(0,1,2,3,4,5,6,7));
1506     OIIO_CHECK_SIMD_EQUAL (vfloat8::Iota(3.0f), vfloat8(3,4,5,6,7,8,9,10));
1507     OIIO_CHECK_SIMD_EQUAL (vfloat8::Iota(3.0f,2.0f), vfloat8(3,5,7,9,11,13,15,17));
1508 
1509     OIIO_CHECK_SIMD_EQUAL (vfloat16::Zero(), vfloat16(0.0f));
1510     OIIO_CHECK_SIMD_EQUAL (vfloat16::One(), vfloat16(1.0f));
1511     OIIO_CHECK_SIMD_EQUAL (vfloat16::Iota(), vfloat16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15));
1512     OIIO_CHECK_SIMD_EQUAL (vfloat16::Iota(3.0f), vfloat16(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18));
1513     OIIO_CHECK_SIMD_EQUAL (vfloat16::Iota(3.0f,2.0f), vfloat16(3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33));
1514 
1515     benchmark ("vfloat4 = float(const)", [](float f){ return vfloat4(f); }, 1.0f);
1516     benchmark ("vfloat4 = Zero()", [](int){ return vfloat4::Zero(); }, 0);
1517     benchmark ("vfloat4 = One()", [](int){ return vfloat4::One(); }, 0);
1518     benchmark ("vfloat4 = Iota()", [](int){ return vfloat4::Iota(); }, 0);
1519 
1520     benchmark ("vfloat8 = float(const)", [](float f){ return vfloat8(f); }, 1.0f);
1521     benchmark ("vfloat8 = Zero()", [](int){ return vfloat8::Zero(); }, 0);
1522     benchmark ("vfloat8 = One()", [](int){ return vfloat8::One(); }, 0);
1523     benchmark ("vfloat8 = Iota()", [](int){ return vfloat8::Iota(); }, 0);
1524 
1525     benchmark ("vfloat16 = float(const)", [](float f){ return vfloat16(f); }, 1.0f);
1526     benchmark ("vfloat16 = Zero()", [](int){ return vfloat16::Zero(); }, 0);
1527     benchmark ("vfloat16 = One()", [](int){ return vfloat16::One(); }, 0);
1528     benchmark ("vfloat16 = Iota()", [](int){ return vfloat16::Iota(); }, 0);
1529 }
1530 
1531 
1532 
1533 // Miscellaneous one-off stuff not caught by other tests
1534 void
test_special()1535 test_special()
1536 {
1537     test_heading("special");
1538     {
1539         // Make sure a vfloat4 constructed from saturated unsigned short,
1540         // short, unsigned char, or char values, then divided by the float
1541         // max, exactly equals 1.0.
1542         short s32767[] = {32767, 32767, 32767, 32767};
1543         unsigned short us65535[] = {65535, 65535, 65535, 65535};
1544         char c127[] = {127, 127, 127, 127};
1545         unsigned char uc255[] = {255, 255, 255, 255};
1546         OIIO_CHECK_SIMD_EQUAL (vfloat4(us65535)/vfloat4(65535.0), vfloat4(1.0f));
1547         OIIO_CHECK_SIMD_EQUAL (vfloat4(us65535)*vfloat4(1.0f/65535.0), vfloat4(1.0f));
1548         OIIO_CHECK_SIMD_EQUAL (vfloat4(s32767)/vfloat4(32767.0), vfloat4(1.0f));
1549         OIIO_CHECK_SIMD_EQUAL (vfloat4(s32767)*vfloat4(1.0f/32767.0), vfloat4(1.0f));
1550         OIIO_CHECK_SIMD_EQUAL (vfloat4(uc255)/vfloat4(255.0), vfloat4(1.0f));
1551         OIIO_CHECK_SIMD_EQUAL (vfloat4(uc255)*vfloat4(1.0f/255.0), vfloat4(1.0f));
1552         OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)/vfloat4(127.0), vfloat4(1.0f));
1553         OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)*vfloat4(1.0f/127.0), vfloat4(1.0f));
1554     }
1555 }
1556 
1557 
1558 
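// Non-overloaded wrappers: fast_exp/fast_log are overloaded on argument
// type, so these give each variant a unique name that can be passed by
// name (e.g. to benchmark()) without ambiguity.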
fast_exp_float(float x)1560 inline float fast_exp_float (float x) { return fast_exp(x); }
// vfloat4 entry point that forwards to fast_exp().
inline vfloat4
fast_exp_vfloat4(const vfloat4& x)
{
    const vfloat4 result = fast_exp(x);
    return result;
}
fast_log_float(float x)1562 inline float fast_log_float (float x) { return fast_log(x); }
1563 //inline vfloat4 fast_log_float (const vfloat4& x) { return fast_log(x); }
// Scalar reference reciprocal square root: 1 / sqrt(f), in single precision.
inline float
rsqrtf(float f)
{
    const float root = sqrtf(f);
    return 1.0f / root;
}
// Scalar reference reciprocal: 1 / f, in single precision.
inline float
rcp(float f)
{
    const float reciprocal = 1.0f / f;
    return reciprocal;
}
1566 
1567 
1568 
1569 template<typename VEC>
test_mathfuncs()1570 void test_mathfuncs ()
1571 {
1572     typedef typename VEC::vint_t vint_t;
1573     test_heading ("mathfuncs", VEC::type_name());
1574 
1575     VEC F = mkvec<VEC> (-1.5f, 0.0f, 1.9f, 4.1f);
1576     OIIO_CHECK_SIMD_EQUAL (abs(F), mkvec<VEC>(std::abs(F[0]), std::abs(F[1]), std::abs(F[2]), std::abs(F[3])));
1577     // OIIO_CHECK_SIMD_EQUAL (sign(F), mkvec<VEC>(std::sign(F[0]), std::sign(F[1]), std::sign(F[2]), std::sign(F[3])));
1578     OIIO_CHECK_SIMD_EQUAL (ceil(F), mkvec<VEC>(std::ceil(F[0]), std::ceil(F[1]), std::ceil(F[2]), std::ceil(F[3])));
1579     OIIO_CHECK_SIMD_EQUAL (floor(F), mkvec<VEC>(std::floor(F[0]), std::floor(F[1]), std::floor(F[2]), std::floor(F[3])));
1580     OIIO_CHECK_SIMD_EQUAL (round(F), mkvec<VEC>(std::round(F[0]), std::round(F[1]), std::round(F[2]), std::round(F[3])));
1581     benchmark ("simd abs", [](const VEC& v){ return abs(v); }, 1.1f);
1582     benchmark ("simd sign", [](const VEC& v){ return sign(v); }, 1.1f);
1583     benchmark ("simd ceil", [](const VEC& v){ return ceil(v); }, 1.1f);
1584     benchmark ("simd floor", [](const VEC& v){ return floor(v); }, 1.1f);
1585     benchmark ("simd round", [](const VEC& v){ return round(v); }, 1.1f);
1586 
1587     VEC A = mkvec<VEC> (-1.0f, 0.0f, 1.0f, 4.5f);
1588     VEC expA = mkvec<VEC> (0.367879441171442f, 1.0f, 2.718281828459045f, 90.0171313005218f);
1589     OIIO_CHECK_SIMD_EQUAL (exp(A), expA);
1590     OIIO_CHECK_SIMD_EQUAL_THRESH (log(expA), A, 1e-6f);
1591     OIIO_CHECK_SIMD_EQUAL (fast_exp(A),
1592                 mkvec<VEC>(fast_exp(A[0]), fast_exp(A[1]), fast_exp(A[2]), fast_exp(A[3])));
1593     OIIO_CHECK_SIMD_EQUAL (fast_log(expA),
1594                 mkvec<VEC>(fast_log(expA[0]), fast_log(expA[1]), fast_log(expA[2]), fast_log(expA[3])));
1595     OIIO_CHECK_SIMD_EQUAL_THRESH (fast_pow_pos(VEC(2.0f), A),
1596                            mkvec<VEC>(0.5f, 1.0f, 2.0f, 22.62741699796952f), 0.0001f);
1597 
1598     OIIO_CHECK_SIMD_EQUAL (safe_div(mkvec<VEC>(1.0f,2.0f,3.0f,4.0f), mkvec<VEC>(2.0f,0.0f,2.0f,0.0f)),
1599                            mkvec<VEC>(0.5f,0.0f,1.5f,0.0f));
1600     OIIO_CHECK_SIMD_EQUAL (sqrt(mkvec<VEC>(1.0f,4.0f,9.0f,16.0f)), mkvec<VEC>(1.0f,2.0f,3.0f,4.0f));
1601     OIIO_CHECK_SIMD_EQUAL (rsqrt(mkvec<VEC>(1.0f,4.0f,9.0f,16.0f)), VEC(1.0f)/mkvec<VEC>(1.0f,2.0f,3.0f,4.0f));
1602     OIIO_CHECK_SIMD_EQUAL_THRESH (rsqrt_fast(mkvec<VEC>(1.0f,4.0f,9.0f,16.0f)),
1603                                   VEC(1.0f)/mkvec<VEC>(1.0f,2.0f,3.0f,4.0f), 0.0005f);
1604     OIIO_CHECK_SIMD_EQUAL_THRESH (rcp_fast(VEC::Iota(1.0f)),
1605                                   VEC(1.0f)/VEC::Iota(1.0f), 0.0005f);
1606 
1607     benchmark2 ("simd operator/", do_div<VEC>, A, A);
1608     benchmark2 ("simd safe_div", do_safe_div<VEC>, A, A);
1609     benchmark ("simd rcp_fast", [](const VEC& v){ return rcp_fast(v); }, mkvec<VEC>(1.0f,4.0f,9.0f,16.0f));
1610 
1611     OIIO_CHECK_SIMD_EQUAL (ifloor(mkvec<VEC>(0.0f, 0.999f, 1.0f, 1.001f)),
1612                            mkvec<vint_t>(0, 0, 1, 1));
1613     OIIO_CHECK_SIMD_EQUAL (ifloor(mkvec<VEC>(0.0f, -0.999f, -1.0f, -1.001f)),
1614                            mkvec<vint_t>(0, -1, -1, -2));
1615     benchmark ("float ifloor", [](float&v){ return ifloor(v); }, 1.1f);
1616     benchmark ("simd ifloor", [](const VEC&v){ return simd::ifloor(v); }, VEC(1.1f));
1617 
1618     int iscalar;
1619     vint_t ival;
1620     VEC fval = -1.1;
1621     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(0.0f),    &ival), 0.0f);   OIIO_CHECK_SIMD_EQUAL (ival, 0);
1622     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(-0.999f), &ival), 0.001f); OIIO_CHECK_SIMD_EQUAL (ival, -1);
1623     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(-1.0f),   &ival), 0.0f);   OIIO_CHECK_SIMD_EQUAL (ival, -1);
1624     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(-1.001f), &ival), 0.999f); OIIO_CHECK_SIMD_EQUAL (ival, -2);
1625     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(0.999f),  &ival), 0.999f); OIIO_CHECK_SIMD_EQUAL (ival, 0);
1626     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(1.0f),    &ival), 0.0f);   OIIO_CHECK_SIMD_EQUAL (ival, 1);
1627     OIIO_CHECK_EQUAL_APPROX (floorfrac(VEC(1.001f),  &ival), 0.001f); OIIO_CHECK_SIMD_EQUAL (ival, 1);
1628     benchmark ("float floorfrac", [&](float x){ return DoNotOptimize(floorfrac(x,&iscalar)); }, 1.1f);
1629     benchmark ("simd floorfrac", [&](const VEC& x){ return DoNotOptimize(floorfrac(x,&ival)); }, fval);
1630 
1631     benchmark ("float expf", expf, 0.67f);
1632     benchmark ("float fast_exp", fast_exp_float, 0.67f);
1633     benchmark ("simd exp", [](const VEC& v){ return simd::exp(v); }, VEC(0.67f));
1634     benchmark ("simd fast_exp", [](const VEC& v){ return fast_exp(v); }, VEC(0.67f));
1635 
1636     benchmark ("float logf", logf, 0.67f);
1637     benchmark ("fast_log", fast_log_float, 0.67f);
1638     benchmark ("simd log", [](const VEC& v){ return simd::log(v); }, VEC(0.67f));
1639     benchmark ("simd fast_log", fast_log<VEC>, VEC(0.67f));
1640     benchmark2 ("float powf", powf, 0.67f, 0.67f);
1641     benchmark2 ("simd fast_pow_pos", [](const VEC& x,const VEC& y){ return fast_pow_pos(x,y); }, VEC(0.67f), VEC(0.67f));
1642     benchmark ("float sqrt", sqrtf, 4.0f);
1643     benchmark ("simd sqrt", [](const VEC& v){ return sqrt(v); }, mkvec<VEC>(1.0f,4.0f,9.0f,16.0f));
1644     benchmark ("float rsqrt", rsqrtf, 4.0f);
1645     benchmark ("simd rsqrt", [](const VEC& v){ return rsqrt(v); }, mkvec<VEC>(1.0f,4.0f,9.0f,16.0f));
1646     benchmark ("simd rsqrt_fast", [](const VEC& v){ return rsqrt_fast(v); }, mkvec<VEC>(1.0f,4.0f,9.0f,16.0f));
1647 }
1648 
1649 
1650 
test_metaprogramming()1651 void test_metaprogramming ()
1652 {
1653     test_heading ("metaprogramming");
1654     OIIO_CHECK_EQUAL (SimdSize<vfloat4>::size, 4);
1655     OIIO_CHECK_EQUAL (SimdSize<vfloat3>::size, 4);
1656     OIIO_CHECK_EQUAL (SimdSize<vint4>::size, 4);
1657     OIIO_CHECK_EQUAL (SimdSize<vbool4>::size, 4);
1658     OIIO_CHECK_EQUAL (SimdSize<vfloat8>::size, 8);
1659     OIIO_CHECK_EQUAL (SimdSize<vint8>::size, 8);
1660     OIIO_CHECK_EQUAL (SimdSize<vbool8>::size, 8);
1661     OIIO_CHECK_EQUAL (SimdSize<vfloat16>::size, 16);
1662     OIIO_CHECK_EQUAL (SimdSize<vint16>::size, 16);
1663     OIIO_CHECK_EQUAL (SimdSize<vbool16>::size, 16);
1664     OIIO_CHECK_EQUAL (SimdSize<float>::size, 1);
1665     OIIO_CHECK_EQUAL (SimdSize<int>::size, 1);
1666     OIIO_CHECK_EQUAL (SimdSize<bool>::size, 1);
1667 
1668     OIIO_CHECK_EQUAL (SimdElements<vfloat4>::size, 4);
1669     OIIO_CHECK_EQUAL (SimdElements<vfloat3>::size, 3);
1670     OIIO_CHECK_EQUAL (SimdElements<vint4>::size, 4);
1671     OIIO_CHECK_EQUAL (SimdElements<vbool4>::size, 4);
1672     OIIO_CHECK_EQUAL (SimdElements<vfloat8>::size, 8);
1673     OIIO_CHECK_EQUAL (SimdElements<vint8>::size, 8);
1674     OIIO_CHECK_EQUAL (SimdElements<vbool8>::size, 8);
1675     OIIO_CHECK_EQUAL (SimdElements<vfloat16>::size, 16);
1676     OIIO_CHECK_EQUAL (SimdElements<vint16>::size, 16);
1677     OIIO_CHECK_EQUAL (SimdElements<vbool16>::size, 16);
1678     OIIO_CHECK_EQUAL (SimdElements<float>::size, 1);
1679     OIIO_CHECK_EQUAL (SimdElements<int>::size, 1);
1680     OIIO_CHECK_EQUAL (SimdElements<bool>::size, 1);
1681 
1682     OIIO_CHECK_EQUAL (vfloat4::elements, 4);
1683     OIIO_CHECK_EQUAL (vfloat3::elements, 3);
1684     OIIO_CHECK_EQUAL (vint4::elements, 4);
1685     OIIO_CHECK_EQUAL (vbool4::elements, 4);
1686     // OIIO_CHECK_EQUAL (vfloat8::elements, 8);
1687     OIIO_CHECK_EQUAL (vint8::elements, 8);
1688     OIIO_CHECK_EQUAL (vbool8::elements, 8);
1689     OIIO_CHECK_EQUAL (vfloat16::elements, 16);
1690     OIIO_CHECK_EQUAL (vint16::elements, 16);
1691     OIIO_CHECK_EQUAL (vbool16::elements, 16);
1692 
1693     // Make sure that VTYPE::value_t returns the right element type
1694     OIIO_CHECK_ASSERT((std::is_same<vfloat4::value_t, float>::value));
1695     OIIO_CHECK_ASSERT((std::is_same<vfloat3::value_t, float>::value));
1696     OIIO_CHECK_ASSERT((std::is_same<vfloat8::value_t, float>::value));
1697     OIIO_CHECK_ASSERT((std::is_same<vfloat16::value_t, float>::value));
1698     OIIO_CHECK_ASSERT((std::is_same<vint4::value_t, int>::value));
1699     OIIO_CHECK_ASSERT((std::is_same<vint8::value_t, int>::value));
1700     OIIO_CHECK_ASSERT((std::is_same<vint16::value_t, int>::value));
1701     OIIO_CHECK_ASSERT((std::is_same<vbool4::value_t, bool>::value));
1702     OIIO_CHECK_ASSERT((std::is_same<vbool8::value_t, bool>::value));
1703     OIIO_CHECK_ASSERT((std::is_same<vbool16::value_t, bool>::value));
1704 
1705     // Make sure that VTYPE::vfloat_t returns the same-sized float type
1706     OIIO_CHECK_ASSERT((std::is_same<vfloat4::vfloat_t, vfloat4>::value));
1707     OIIO_CHECK_ASSERT((std::is_same<vfloat8::vfloat_t, vfloat8>::value));
1708     OIIO_CHECK_ASSERT((std::is_same<vfloat16::vfloat_t, vfloat16>::value));
1709     OIIO_CHECK_ASSERT((std::is_same<vint4::vfloat_t, vfloat4>::value));
1710     OIIO_CHECK_ASSERT((std::is_same<vint8::vfloat_t, vfloat8>::value));
1711     OIIO_CHECK_ASSERT((std::is_same<vint16::vfloat_t, vfloat16>::value));
1712 
1713     // Make sure that VTYPE::vint_t returns the same-sized int type
1714     OIIO_CHECK_ASSERT((std::is_same<vfloat4::vint_t, vint4>::value));
1715     OIIO_CHECK_ASSERT((std::is_same<vfloat8::vint_t, vint8>::value));
1716     OIIO_CHECK_ASSERT((std::is_same<vfloat16::vint_t, vint16>::value));
1717     OIIO_CHECK_ASSERT((std::is_same<vint4::vint_t, vint4>::value));
1718     OIIO_CHECK_ASSERT((std::is_same<vint8::vint_t, vint8>::value));
1719     OIIO_CHECK_ASSERT((std::is_same<vint16::vint_t, vint16>::value));
1720 
1721     // Make sure that VTYPE::vbool_t returns the same-sized bool type
1722     OIIO_CHECK_ASSERT((std::is_same<vfloat4::vbool_t, vbool4>::value));
1723     OIIO_CHECK_ASSERT((std::is_same<vfloat8::vbool_t, vbool8>::value));
1724     OIIO_CHECK_ASSERT((std::is_same<vfloat16::vbool_t, vbool16>::value));
1725     OIIO_CHECK_ASSERT((std::is_same<vint4::vbool_t, vbool4>::value));
1726     OIIO_CHECK_ASSERT((std::is_same<vint8::vbool_t, vbool8>::value));
1727     OIIO_CHECK_ASSERT((std::is_same<vint16::vbool_t, vbool16>::value));
1728 }
1729 
1730 
1731 
1732 // Transform a point by a matrix using regular Imath
1733 inline Imath::V3f
transformp_imath(const Imath::V3f & v,const Imath::M44f & m)1734 transformp_imath(const Imath::V3f& v, const Imath::M44f& m)
1735 {
1736     Imath::V3f r;
1737     m.multVecMatrix(v, r);
1738     return r;
1739 }
1740 
1741 // Transform a point by a matrix using simd ops on Imath types.
1742 inline Imath::V3f
transformp_imath_simd(const Imath::V3f & v,const Imath::M44f & m)1743 transformp_imath_simd(const Imath::V3f& v, const Imath::M44f& m)
1744 {
1745     return simd::transformp(m, v).V3f();
1746 }
1747 
1748 // Transform a simd point by an Imath matrix using SIMD
1749 inline vfloat3
transformp_simd(const vfloat3 & v,const Imath::M44f & m)1750 transformp_simd(const vfloat3& v, const Imath::M44f& m)
1751 {
1752     return simd::transformp(m, v);
1753 }
1754 
1755 // Transform a point by a matrix using regular Imath
1756 inline Imath::V3f
transformv_imath(const Imath::V3f & v,const Imath::M44f & m)1757 transformv_imath(const Imath::V3f& v, const Imath::M44f& m)
1758 {
1759     Imath::V3f r;
1760     m.multDirMatrix(v, r);
1761     return r;
1762 }
1763 
1764 inline Imath::V4f
mul_vm_imath(const Imath::V4f & v,const Imath::M44f & m)1765 mul_vm_imath(const Imath::V4f& v, const Imath::M44f& m)
1766 {
1767     return v*m;
1768 }
1769 
1770 // inline Imath::V4f
1771 // mul_mv_imath(const Imath::M44f& m, const Imath::V4f& v)
1772 // {
1773 //     return m*v;
1774 // }
1775 
1776 inline vfloat4
mul_vm_simd(const vfloat4 & v,const matrix44 & m)1777 mul_vm_simd(const vfloat4& v, const matrix44& m)
1778 {
1779     return v*m;
1780 }
1781 
1782 inline vfloat4
mul_mv_simd(const matrix44 & m,const vfloat4 v)1783 mul_mv_simd(const matrix44& m, const vfloat4 v)
1784 {
1785     return m*v;
1786 }
1787 
1788 
1789 
1790 inline bool
mx_equal_thresh(const matrix44 & a,const matrix44 & b,float thresh)1791 mx_equal_thresh(const matrix44& a, const matrix44& b, float thresh)
1792 {
1793     for (int j = 0; j < 4; ++j)
1794         for (int i = 0; i < 4; ++i)
1795             if (fabsf(a[j][i] - b[j][i]) > thresh)
1796                 return false;
1797     return true;
1798 }
1799 
1800 
1801 
1802 inline Imath::M44f
mat_transpose(const Imath::M44f & m)1803 mat_transpose(const Imath::M44f& m)
1804 {
1805     return m.transposed();
1806 }
1807 
1808 inline Imath::M44f
mat_transpose_simd(const Imath::M44f & m)1809 mat_transpose_simd(const Imath::M44f& m)
1810 {
1811     return matrix44(m).transposed().M44f();
1812 }
1813 
1814 
1815 
1816 void
test_matrix()1817 test_matrix()
1818 {
1819     Imath::V3f P(1.0f, 0.0f, 0.0f);
1820     Imath::M44f Mtrans(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 10, 11, 12, 1);
1821     Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI_2, 0.0f));
1822 
1823     test_heading("Testing matrix ops:");
1824     std::cout << "  P = " << P << "\n";
1825     std::cout << "  Mtrans = " << Mtrans << "\n";
1826     std::cout << "  Mrot   = " << Mrot << "\n";
1827     OIIO_CHECK_EQUAL(simd::transformp(Mtrans, P).V3f(),
1828                      transformp_imath(P, Mtrans));
1829     std::cout << "  P translated = " << simd::transformp(Mtrans, P) << "\n";
1830     OIIO_CHECK_EQUAL(simd::transformv(Mtrans, P).V3f(), P);
1831     OIIO_CHECK_EQUAL(simd::transformp(Mrot, P).V3f(),
1832                      transformp_imath(P, Mrot));
1833     std::cout << "  P rotated = " << simd::transformp(Mrot, P) << "\n";
1834     OIIO_CHECK_EQUAL(simd::transformvT(Mrot, P).V3f(),
1835                      transformv_imath(P, Mrot.transposed()));
1836     std::cout << "  P rotated by the transpose = " << simd::transformv(Mrot, P)
1837               << "\n";
1838     OIIO_CHECK_EQUAL(matrix44(Mrot).transposed().M44f(), Mrot.transposed());
1839     std::cout << "  Mrot transposed = " << matrix44(Mrot).transposed().M44f()
1840               << "\n";
1841 
1842     // Test m44 * v4, v4 * m44
1843     {
1844         Imath::M44f M(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
1845         matrix44 m(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
1846         Imath::V4f V(1,2,3,4);
1847         vfloat4 v(1,2,3,4);
1848         vfloat4 vm = v*m;
1849         OIIO_CHECK_SIMD_EQUAL(vm, vfloat4(V*M));
1850         // vfloat4 mv = m*v;
1851         // OIIO_CHECK_SIMD_EQUAL(mv, M*V);
1852         benchmark2("V4 * M44 Imath", mul_vm_imath, V, M, 1);
1853         // benchmark2("M44 * V4 Imath", mul_mv_imath, mx, v4x, 1);
1854         benchmark2("M44 * V4 simd", mul_mv_simd, m, v, 1);
1855         benchmark2("V4 * M44 simd", mul_vm_simd, v, m, 1);
1856     }
1857 
1858     // Test ==, !=
1859     {
1860         matrix44 mt(Mtrans), mr(Mrot);
1861         OIIO_CHECK_EQUAL(mt, mt);
1862         OIIO_CHECK_EQUAL(mt, Mtrans);
1863         OIIO_CHECK_EQUAL(Mtrans, mt);
1864         OIIO_CHECK_NE(mt, mr);
1865         OIIO_CHECK_NE(mr, Mtrans);
1866         OIIO_CHECK_NE(Mtrans, mr);
1867     }
1868     OIIO_CHECK_ASSERT(
1869         mx_equal_thresh(matrix44(Mtrans.inverse()), matrix44(Mtrans).inverse(),
1870                         1.0e-6f));
1871     OIIO_CHECK_ASSERT(
1872         mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
1873                         1.0e-6f));
1874     OIIO_CHECK_EQUAL(
1875         matrix44(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
1876         Imath::M44f(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
1877 
1878     Imath::V3f vx(2.51f, 1.0f, 1.0f);
1879     Imath::M44f mx(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 10, 11, 12, 1);
1880     benchmark2("transformp Imath", transformp_imath, vx, mx, 1);
1881     benchmark2("transformp Imath with simd", transformp_imath_simd, vx, mx, 1);
1882     benchmark2("transformp simd", transformp_simd, vfloat3(vx), mx, 1);
1883 
1884     benchmark("transpose m44", mat_transpose, mx, 1);
1885     benchmark("transpose m44 with simd", mat_transpose_simd, mx, 1);
1886     // Reduce the iterations of the ones below, if we can
1887     iterations /= 2;
1888     benchmark("m44 inverse Imath", inverse_imath, mx, 1);
1889     // std::cout << "inv " << matrix44(inverse_imath(mx)) << "\n";
1890     benchmark("m44 inverse_simd", inverse_simd, matrix44(mx), 1);
1891     // std::cout << "inv " << inverse_simd(mx) << "\n";
1892     benchmark("m44 inverse_simd native simd", inverse_simd, matrix44(mx), 1);
1893     // std::cout << "inv " << inverse_simd(mx) << "\n";
1894     iterations *= 2;  // put things the way they were
1895 }
1896 
1897 
1898 
1899 int
main(int argc,char * argv[])1900 main(int argc, char* argv[])
1901 {
1902 #if !defined(NDEBUG) || defined(OIIO_CI) || defined(OIIO_CODE_COVERAGE)
1903     // For the sake of test time, reduce the default iterations for DEBUG,
1904     // CI, and code coverage builds. Explicit use of --iters or --trials
1905     // will override this, since it comes before the getargs() call.
1906     iterations /= 10;
1907     ntrials = 1;
1908 #endif
1909     for (int i = 0; i < 16; ++i) {
1910         dummy_float[i] = 1.0f;
1911         dummy_int[i]   = 1;
1912     }
1913 
1914     getargs(argc, argv);
1915 
1916     std::string oiiosimd = OIIO::get_string_attribute("oiio:simd");
1917     std::string hwsimd   = OIIO::get_string_attribute("hw:simd");
1918     std::cout << "OIIO SIMD support is: " << (oiiosimd.size() ? oiiosimd : "")
1919               << "\n";
1920     std::cout << "Hardware SIMD support is: " << (hwsimd.size() ? hwsimd : "")
1921               << "\n";
1922     std::cout << "\n";
1923 
1924     Timer timer;
1925 
1926     vint4 dummy4(0);
1927     vint8 dummy8(0);
1928     benchmark("null benchmark 4", [](const vint4&) { return int(0); }, dummy4);
1929     benchmark("null benchmark 8", [](const vint8&) { return int(0); }, dummy8);
1930 
1931     category_heading("vfloat4");
1932     test_loadstore<vfloat4>();
1933     test_conversion_loadstore_float<vfloat4>();
1934     test_masked_loadstore<vfloat4>();
1935     test_gatherscatter<vfloat4>();
1936     test_component_access<vfloat4>();
1937     test_arithmetic<vfloat4>();
1938     test_comparisons<vfloat4>();
1939     test_shuffle4<vfloat4>();
1940     test_swizzle<vfloat4>();
1941     test_blend<vfloat4>();
1942     test_transpose4<vfloat4>();
1943     test_vectorops_vfloat4();
1944     test_fused<vfloat4>();
1945     test_mathfuncs<vfloat4>();
1946 
1947     category_heading("vfloat3");
1948     test_loadstore<vfloat3>();
1949     test_conversion_loadstore_float<vfloat3>();
1950     test_component_access<vfloat3>();
1951     test_arithmetic<vfloat3>();
1952     // Unnecessary to test these, they just use the vfloat4 ops.
1953     // test_comparisons<vfloat3> ();
1954     // test_shuffle4<vfloat3> ();
1955     // test_swizzle<vfloat3> ();
1956     // test_blend<vfloat3> ();
1957     // test_transpose4<vfloat3> ();
1958     test_vectorops_vfloat3();
1959     test_fused<vfloat3>();
1960     // test_mathfuncs<vfloat3>();
1961 
1962     category_heading("vfloat8");
1963     test_loadstore<vfloat8>();
1964     test_conversion_loadstore_float<vfloat8>();
1965     test_masked_loadstore<vfloat8>();
1966     test_gatherscatter<vfloat8>();
1967     test_component_access<vfloat8>();
1968     test_arithmetic<vfloat8>();
1969     test_comparisons<vfloat8>();
1970     test_shuffle8<vfloat8>();
1971     test_blend<vfloat8>();
1972     test_fused<vfloat8>();
1973     test_mathfuncs<vfloat8>();
1974 
1975     category_heading("vfloat16");
1976     test_loadstore<vfloat16>();
1977     test_conversion_loadstore_float<vfloat16>();
1978     test_masked_loadstore<vfloat16>();
1979     test_gatherscatter<vfloat16>();
1980     test_component_access<vfloat16>();
1981     test_arithmetic<vfloat16>();
1982     test_comparisons<vfloat16>();
1983     test_shuffle16<vfloat16>();
1984     test_blend<vfloat16>();
1985     test_fused<vfloat16>();
1986     test_mathfuncs<vfloat16>();
1987 
1988     category_heading("vint4");
1989     test_loadstore<vint4>();
1990     test_conversion_loadstore_int<vint4>();
1991     test_masked_loadstore<vint4>();
1992     test_gatherscatter<vint4>();
1993     test_component_access<vint4>();
1994     test_arithmetic<vint4>();
1995     test_bitwise_int<vint4>();
1996     test_comparisons<vint4>();
1997     test_shuffle4<vint4>();
1998     test_blend<vint4>();
1999     test_vint_to_uint16s<vint4>();
2000     test_vint_to_uint8s<vint4>();
2001     test_shift<vint4>();
2002     test_transpose4<vint4>();
2003 
2004     category_heading("vint8");
2005     test_loadstore<vint8>();
2006     test_conversion_loadstore_int<vint8>();
2007     test_masked_loadstore<vint8>();
2008     test_gatherscatter<vint8>();
2009     test_component_access<vint8>();
2010     test_arithmetic<vint8>();
2011     test_bitwise_int<vint8>();
2012     test_comparisons<vint8>();
2013     test_shuffle8<vint8>();
2014     test_blend<vint8>();
2015     test_vint_to_uint16s<vint8>();
2016     test_vint_to_uint8s<vint8>();
2017     test_shift<vint8>();
2018 
2019     category_heading("vint16");
2020     test_loadstore<vint16>();
2021     test_conversion_loadstore_int<vint16>();
2022     test_masked_loadstore<vint16>();
2023     test_gatherscatter<vint16>();
2024     test_component_access<vint16>();
2025     test_arithmetic<vint16>();
2026     test_bitwise_int<vint16>();
2027     test_comparisons<vint16>();
2028     test_shuffle16<vint16>();
2029     test_blend<vint16>();
2030     test_vint_to_uint16s<vint16>();
2031     test_vint_to_uint16s<vint16>();
2032     test_shift<vint16>();
2033 
2034     category_heading("vbool4");
2035     test_shuffle4<vbool4>();
2036     test_component_access<vbool4>();
2037     test_bitwise_bool<vbool4>();
2038 
2039     category_heading("vbool8");
2040     test_shuffle8<vbool8>();
2041     test_component_access<vbool8>();
2042     test_bitwise_bool<vbool8>();
2043 
2044     category_heading("vbool16");
2045     // test_shuffle16<vbool16> ();
2046     test_component_access<vbool16>();
2047     test_bitwise_bool<vbool16>();
2048 
2049     category_heading("Odds and ends");
2050     test_constants();
2051     test_special();
2052     test_metaprogramming();
2053     test_matrix();
2054 
2055     std::cout << "\nTotal time: " << Strutil::timeintervalformat(timer())
2056               << "\n";
2057 
2058     return unit_test_failures;
2059 }
2060