1 /***************************************************************************
2 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
3 * Martin Renou                                                             *
4 * Copyright (c) QuantStack                                                 *
5 *                                                                          *
6 * Distributed under the terms of the BSD 3-Clause License.                 *
7 *                                                                          *
8 * The full license is in the file LICENSE, distributed with this software. *
9 ****************************************************************************/
10 
11 #ifndef XSIMD_BENCHMARK_HPP
12 #define XSIMD_BENCHMARK_HPP
13 
14 #include <chrono>
15 #include <string>
16 #include <vector>
17 #include <iostream>
18 #include "xsimd/xsimd.hpp"
19 
20 namespace xsimd
21 {
22     template <class T>
23     std::string batch_name();
24 
batch_name()25     template <> inline std::string batch_name<batch<float, 4>>() { return "sse/neon float"; }
batch_name()26     template <> inline std::string batch_name<batch<double, 2>>() { return "sse/neon double"; }
batch_name()27     template <> inline std::string batch_name<batch<float, 8>>() { return "avx float"; }
batch_name()28     template <> inline std::string batch_name<batch<double, 4>>() { return "avx double"; }
batch_name()29     template <> inline std::string batch_name<batch<float, 7>>() { return "fallback float"; }
batch_name()30     template <> inline std::string batch_name<batch<double, 3>>() { return "fallback double"; }
31 
32     using duration_type = std::chrono::duration<double, std::milli>;
33 
34     template <class T>
35     using bench_vector = std::vector<T, xsimd::aligned_allocator<T, XSIMD_DEFAULT_ALIGNMENT>>;
36 
37     template <class T>
init_benchmark(bench_vector<T> & lhs,bench_vector<T> & rhs,bench_vector<T> & res,size_t size)38     void init_benchmark(bench_vector<T>& lhs, bench_vector<T>& rhs, bench_vector<T>& res, size_t size)
39     {
40         lhs.resize(size);
41         rhs.resize(size);
42         res.resize(size);
43         for (size_t i = 0; i < size; ++i)
44         {
45             lhs[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size);
46             rhs[i] = T(10.2) / T(i + 2) + T(0.25);
47         }
48     }
49 
50     template <class T>
init_benchmark(bench_vector<T> & op0,bench_vector<T> & op1,bench_vector<T> & op2,bench_vector<T> & res,size_t size)51     void init_benchmark(bench_vector<T>& op0, bench_vector<T>& op1, bench_vector<T>& op2, bench_vector<T>& res, size_t size)
52     {
53         op0.resize(size);
54         op1.resize(size);
55         op2.resize(size);
56         res.resize(size);
57         for (size_t i = 0; i < size; ++i)
58         {
59             op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size);
60             op1[i] = T(10.2) / T(i + 2) + T(0.25);
61             op2[i] = T(20.1) / T(i + 5) + T(0.65);
62         }
63     }
64 
65     template <class T>
init_benchmark_arctrigo(bench_vector<T> & lhs,bench_vector<T> & rhs,bench_vector<T> & res,size_t size)66     void init_benchmark_arctrigo(bench_vector<T>& lhs, bench_vector<T>& rhs, bench_vector<T>& res, size_t size)
67     {
68         lhs.resize(size);
69         rhs.resize(size);
70         res.resize(size);
71         for (size_t i = 0; i < size; ++i)
72         {
73             lhs[i] = T(-1.) + T(2.) * T(i) / T(size);
74             rhs[i] = T(i) / T(i + 2) + T(0.25);
75         }
76     }
77 
78     enum class init_method
79     {
80         classic,
81         arctrigo
82     };
83 
84     template <class F, class V>
benchmark_scalar(F f,V & lhs,V & res,std::size_t number)85     duration_type benchmark_scalar(F f, V& lhs, V& res, std::size_t number)
86     {
87         size_t s = lhs.size();
88         duration_type t_res = duration_type::max();
89         for (std::size_t count = 0; count < number; ++count)
90         {
91             auto start = std::chrono::steady_clock::now();
92             for (size_t i = 0; i < s; ++i)
93             {
94                 res[i] = f(lhs[i]);
95             }
96             auto end = std::chrono::steady_clock::now();
97             auto tmp = end - start;
98             t_res = tmp < t_res ? tmp : t_res;
99         }
100         return t_res;
101     }
102 
103     template <class F, class V>
benchmark_scalar(F f,V & lhs,V & rhs,V & res,std::size_t number)104     duration_type benchmark_scalar(F f, V& lhs, V& rhs, V& res, std::size_t number)
105     {
106         size_t s = lhs.size();
107         duration_type t_res = duration_type::max();
108         for (std::size_t count = 0; count < number; ++count)
109         {
110             auto start = std::chrono::steady_clock::now();
111             for (size_t i = 0; i < s; ++i)
112             {
113                res[i] = f(lhs[i], rhs[i]);
114             }
115             auto end = std::chrono::steady_clock::now();
116             auto tmp = end - start;
117             t_res = tmp < t_res ? tmp : t_res;
118         }
119         return t_res;
120     }
121 
122     template <class F, class V>
benchmark_scalar(F f,V & op0,V & op1,V & op2,V & res,std::size_t number)123     duration_type benchmark_scalar(F f, V& op0, V& op1, V& op2, V& res, std::size_t number)
124     {
125         size_t s = op0.size();
126         duration_type t_res = duration_type::max();
127         for (std::size_t count = 0; count < number; ++count)
128         {
129             auto start = std::chrono::steady_clock::now();
130             for (size_t i = 0; i < s; ++i)
131             {
132                res[i] = f(op0[i], op1[i], op2[i]);
133             }
134             auto end = std::chrono::steady_clock::now();
135             auto tmp = end - start;
136             t_res = tmp < t_res ? tmp : t_res;
137         }
138         return t_res;
139     }
140 
141     template <class B, class F, class V>
benchmark_simd(F f,V & lhs,V & res,std::size_t number)142     duration_type benchmark_simd(F f, V& lhs, V& res, std::size_t number)
143     {
144         std::size_t s = lhs.size();
145         duration_type t_res = duration_type::max();
146         for (std::size_t count = 0; count < number; ++count)
147         {
148             auto start = std::chrono::steady_clock::now();
149             for (std::size_t i = 0; i <= (s - B::size); i += B::size)
150             {
151                 B blhs(&lhs[i], aligned_mode());
152                 B bres = f(blhs);
153                 bres.store_aligned(&res[i]);
154             }
155             auto end = std::chrono::steady_clock::now();
156             auto tmp = end - start;
157             t_res = tmp < t_res ? tmp : t_res;
158         }
159         return t_res;
160     }
161 
162     template <class B, class F, class V>
benchmark_simd_unrolled(F f,V & lhs,V & res,std::size_t number)163     duration_type benchmark_simd_unrolled(F f, V& lhs, V& res, std::size_t number)
164     {
165         std::size_t s = lhs.size();
166         std::size_t inc = 4 * B::size;
167         duration_type t_res = duration_type::max();
168         for (std::size_t count = 0; count < number; ++count)
169         {
170             auto start = std::chrono::steady_clock::now();
171             for (std::size_t i = 0; i <= (s - inc); i += inc)
172             {
173                 size_t j = i + B::size;
174                 size_t k = j + B::size;
175                 size_t l = k + B::size;
176                 B blhs(&lhs[i], aligned_mode()), blhs2(&lhs[j], aligned_mode()),
177                   blhs3(&lhs[k], aligned_mode()), blhs4(&lhs[l], aligned_mode());
178                 B bres = f(blhs);
179                 B bres2 = f(blhs2);
180                 B bres3 = f(blhs3);
181                 B bres4 = f(blhs4);
182                 bres.store_aligned(&res[i]);
183                 bres2.store_aligned(&res[j]);
184                 bres3.store_aligned(&res[k]);
185                 bres4.store_aligned(&res[l]);
186             }
187             auto end = std::chrono::steady_clock::now();
188             auto tmp = end - start;
189             t_res = tmp < t_res ? tmp : t_res;
190         }
191         return t_res;
192     }
193 
194     template <class B, class F, class V>
benchmark_simd(F f,V & lhs,V & rhs,V & res,std::size_t number)195     duration_type benchmark_simd(F f, V& lhs, V& rhs, V& res, std::size_t number)
196     {
197         std::size_t s = lhs.size();
198         duration_type t_res = duration_type::max();
199         for (std::size_t count = 0; count < number; ++count)
200         {
201             auto start = std::chrono::steady_clock::now();
202             for (std::size_t i = 0; i <= (s - B::size); i += B::size)
203             {
204                 B blhs(&lhs[i], aligned_mode()), brhs(&rhs[i], aligned_mode());
205                 B bres = f(blhs, brhs);
206                 bres.store_aligned(&res[i]);
207             }
208             auto end = std::chrono::steady_clock::now();
209             auto tmp = end - start;
210             t_res = tmp < t_res ? tmp : t_res;
211         }
212         return t_res;
213     }
214 
215     template <class B, class F, class V>
benchmark_simd_unrolled(F f,V & lhs,V & rhs,V & res,std::size_t number)216     duration_type benchmark_simd_unrolled(F f, V& lhs, V& rhs, V& res, std::size_t number)
217     {
218         std::size_t s = lhs.size();
219         std::size_t inc = 4 * B::size;
220         duration_type t_res = duration_type::max();
221         for (std::size_t count = 0; count < number; ++count)
222         {
223             auto start = std::chrono::steady_clock::now();
224             for (std::size_t i = 0; i <= (s - inc); i += inc)
225             {
226                 size_t j = i + B::size;
227                 size_t k = j + B::size;
228                 size_t l = k + B::size;
229                 B blhs(&lhs[i], aligned_mode()), brhs(&rhs[i], aligned_mode()),
230                   blhs2(&lhs[j], aligned_mode()), brhs2(&rhs[j], aligned_mode());
231                 B blhs3(&lhs[k], aligned_mode()), brhs3(&rhs[k], aligned_mode()),
232                   blhs4(&lhs[l], aligned_mode()), brhs4(&rhs[l], aligned_mode());
233                 B bres = f(blhs, brhs);
234                 B bres2 = f(blhs2, brhs2);
235                 B bres3 = f(blhs3, brhs3);
236                 B bres4 = f(blhs4, brhs4);
237                 bres.store_aligned(&res[i]);
238                 bres2.store_aligned(&res[j]);
239                 bres3.store_aligned(&res[k]);
240                 bres4.store_aligned(&res[l]);
241             }
242             auto end = std::chrono::steady_clock::now();
243             auto tmp = end - start;
244             t_res = tmp < t_res ? tmp : t_res;
245         }
246         return t_res;
247     }
248 
249 
250     template <class B, class F, class V>
benchmark_simd(F f,V & op0,V & op1,V & op2,V & res,std::size_t number)251     duration_type benchmark_simd(F f, V& op0, V& op1, V& op2, V& res, std::size_t number)
252     {
253         std::size_t s = op0.size();
254         duration_type t_res = duration_type::max();
255         for (std::size_t count = 0; count < number; ++count)
256         {
257             auto start = std::chrono::steady_clock::now();
258             for (std::size_t i = 0; i <= (s - B::size); i += B::size)
259             {
260                 B bop0(&op0[i], aligned_mode()),
261                   bop1(&op1[i], aligned_mode()),
262                   bop2(&op2[i], aligned_mode());
263                 B bres = f(bop0, bop1, bop2);
264                 bres.store_aligned(&res[i]);
265             }
266             auto end = std::chrono::steady_clock::now();
267             auto tmp = end - start;
268             t_res = tmp < t_res ? tmp : t_res;
269         }
270         return t_res;
271     }
272 
273     template <class B, class F, class V>
benchmark_simd_unrolled(F f,V & op0,V & op1,V & op2,V & res,std::size_t number)274     duration_type benchmark_simd_unrolled(F f, V& op0, V& op1, V& op2, V& res, std::size_t number)
275     {
276         std::size_t s = op0.size();
277         std::size_t inc = 4 * B::size;
278         duration_type t_res = duration_type::max();
279         for (std::size_t count = 0; count < number; ++count)
280         {
281             auto start = std::chrono::steady_clock::now();
282             for (std::size_t i = 0; i <= (s - inc); i += inc)
283             {
284                 size_t j = i + B::size;
285                 size_t k = j + B::size;
286                 size_t l = k + B::size;
287                 B bop0_i(&op0[i], aligned_mode()), bop1_i(&op1[i], aligned_mode()), bop2_i(&op2[i], aligned_mode());
288                 B bop0_j(&op0[j], aligned_mode()), bop1_j(&op1[j], aligned_mode()), bop2_j(&op2[j], aligned_mode());
289                 B bop0_k(&op0[k], aligned_mode()), bop1_k(&op1[k], aligned_mode()), bop2_k(&op2[k], aligned_mode());
290                 B bop0_l(&op0[l], aligned_mode()), bop1_l(&op1[l], aligned_mode()), bop2_l(&op2[l], aligned_mode());
291                 B bres_i = f(bop0_i, bop1_i, bop2_i);
292                 B bres_j = f(bop0_j, bop1_j, bop2_j);
293                 B bres_k = f(bop0_k, bop1_k, bop2_k);
294                 B bres_l = f(bop0_l, bop1_l, bop2_l);
295                 bres_i.store_aligned(&res[i]);
296                 bres_j.store_aligned(&res[j]);
297                 bres_k.store_aligned(&res[k]);
298                 bres_l.store_aligned(&res[l]);
299             }
300             auto end = std::chrono::steady_clock::now();
301             auto tmp = end - start;
302             t_res = tmp < t_res ? tmp : t_res;
303         }
304         return t_res;
305     }
306 
307     template <class F, class OS>
run_benchmark_1op(F f,OS & out,std::size_t size,std::size_t iter,init_method init=init_method::classic)308     void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic)
309     {
310         bench_vector<float> f_lhs, f_rhs, f_res;
311         bench_vector<double> d_lhs, d_rhs, d_res;
312 
313         switch (init)
314         {
315         case init_method::classic:
316             init_benchmark(f_lhs, f_rhs, f_res, size);
317             init_benchmark(d_lhs, d_rhs, d_res, size);
318             break;
319         case init_method::arctrigo:
320             init_benchmark_arctrigo(f_lhs, f_rhs, f_res, size);
321             init_benchmark_arctrigo(d_lhs, d_rhs, d_res, size);
322             break;
323         default:
324             init_benchmark(f_lhs, f_rhs, f_res, size);
325             init_benchmark(d_lhs, d_rhs, d_res, size);
326             break;
327         }
328 
329 #ifndef XSIMD_POLY_BENCHMARKS
330         duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_res, iter);
331         duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_res, iter);
332 #endif
333 
334 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
335         duration_type t_float_sse = benchmark_simd<batch<float, 4>>(f, f_lhs, f_res, iter);
336         duration_type t_float_sse_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_res, iter);
337         duration_type t_double_sse = benchmark_simd<batch<double, 2>>(f, d_lhs, d_res, iter);
338         duration_type t_double_sse_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_res, iter);
339 #endif
340 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
341         duration_type t_float_avx = benchmark_simd<batch<float, 8>>(f, f_lhs, f_res, iter);
342         duration_type t_float_avx_u = benchmark_simd_unrolled<batch<float, 8>>(f, f_lhs, f_res, iter);
343         duration_type t_double_avx = benchmark_simd<batch<double, 4>>(f, d_lhs, d_res, iter);
344         duration_type t_double_avx_u = benchmark_simd_unrolled<batch<double, 4>>(f, d_lhs, d_res, iter);
345 #endif
346 #if defined(XSIMD_ARM_INSTR_SET)
347         duration_type t_float_neon = benchmark_simd<batch<float, 4>>(f, f_lhs, f_res, iter);
348         duration_type t_float_neon_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_res, iter);
349         duration_type t_double_neon = benchmark_simd<batch<double, 2>>(f, d_lhs, d_res, iter);
350         duration_type t_double_neon_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_res, iter);
351 #endif
352 #if defined(XSIMD_ENABLE_FALLBACK)
353         duration_type t_float_fallback = benchmark_simd<batch<float, 7>>(f, f_lhs, f_res, iter);
354         duration_type t_float_fallback_u = benchmark_simd_unrolled<batch<float, 7>>(f, f_lhs, f_res, iter);
355         duration_type t_double_fallback = benchmark_simd<batch<double, 3>>(f, d_lhs, d_res, iter);
356         duration_type t_double_fallback_u = benchmark_simd_unrolled<batch<double, 3>>(f, d_lhs, d_res, iter);
357 #endif
358 
359         out << "============================" << std::endl;
360         out << f.name() << std::endl;
361 #ifndef XSIMD_POLY_BENCHMARKS
362         out << "scalar float   : " << t_float_scalar.count() << "ms" << std::endl;
363 #endif
364 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
365         out << "sse float      : " << t_float_sse.count() << "ms" << std::endl;
366         out << "sse float unr  : " << t_float_sse_u.count() << "ms" << std::endl;
367 #endif
368 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
369         out << "avx float      : " << t_float_avx.count() << "ms" << std::endl;
370         out << "avx float unr  : " << t_float_avx_u.count() << "ms" << std::endl;
371 #endif
372 #if defined(XSIMD_ARM_INSTR_SET)
373         out << "neon float     : " << t_float_neon.count() << "ms" << std::endl;
374         out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl;
375 #endif
376 #if defined(XSIMD_ENABLE_FALLBACK)
377         out << "flbk float     : " << t_float_fallback.count() << "ms" << std::endl;
378         out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl;
379 #endif
380 #ifndef XSIMD_POLY_BENCHMARKS
381         out << "scalar double  : " << t_double_scalar.count() << "ms" << std::endl;
382 #endif
383 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
384         out << "sse double     : " << t_double_sse.count() << "ms" << std::endl;
385         out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl;
386 #endif
387 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
388         out << "avx double     : " << t_double_avx.count() << "ms" << std::endl;
389         out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl;
390 #endif
391 #if defined(XSIMD_ARM_INSTR_SET)
392         out << "neon double    : " << t_double_neon.count() << "ms" << std::endl;
393         out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl;
394 #endif
395 #if defined(XSIMD_ENABLE_FALLBACK)
396         out << "flbk double    : " << t_double_fallback.count() << "ms" << std::endl;
397         out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl;
398 #endif
399         out << "============================" << std::endl;
400     }
401 
402     template <class F, class OS>
run_benchmark_2op(F f,OS & out,std::size_t size,std::size_t iter)403     void run_benchmark_2op(F f, OS& out, std::size_t size, std::size_t iter)
404     {
405         bench_vector<float> f_lhs, f_rhs, f_res;
406         bench_vector<double> d_lhs, d_rhs, d_res;
407 
408         init_benchmark(f_lhs, f_rhs, f_res, size);
409         init_benchmark(d_lhs, d_rhs, d_res, size);
410 
411         duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_rhs, f_res, iter);
412 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
413         duration_type t_float_sse = benchmark_simd<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter);
414         duration_type t_float_sse_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter);
415 #endif
416 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
417         duration_type t_float_avx = benchmark_simd<batch<float, 8>>(f, f_lhs, f_rhs, f_res, iter);
418         duration_type t_float_avx_u = benchmark_simd_unrolled<batch<float, 8>>(f, f_lhs, f_rhs, f_res, iter);
419 #endif
420         duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_rhs, d_res, iter);
421 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
422         duration_type t_double_sse = benchmark_simd<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter);
423         duration_type t_double_sse_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter);
424 #endif
425 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
426         duration_type t_double_avx = benchmark_simd<batch<double, 4>>(f, d_lhs, d_rhs, d_res, iter);
427         duration_type t_double_avx_u = benchmark_simd_unrolled<batch<double, 4>>(f, d_lhs, d_rhs, d_res, iter);
428 #endif
429 #if defined(XSIMD_ARM_INSTR_SET)
430         duration_type t_float_neon = benchmark_simd<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter);
431         duration_type t_float_neon_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter);
432         duration_type t_double_neon = benchmark_simd<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter);
433         duration_type t_double_neon_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter);
434 #endif
435 #if defined(XSIMD_ENABLE_FALLBACK)
436         duration_type t_float_fallback = benchmark_simd<batch<float, 7>>(f, f_lhs, f_rhs, f_res, iter);
437         duration_type t_float_fallback_u = benchmark_simd_unrolled<batch<float, 7>>(f, f_lhs, f_rhs, f_res, iter);
438         duration_type t_double_fallback = benchmark_simd<batch<double, 3>>(f, d_lhs, d_rhs, d_res, iter);
439         duration_type t_double_fallback_u = benchmark_simd_unrolled<batch<double, 3>>(f, d_lhs, d_rhs, d_res, iter);
440 #endif
441 
442         out << "============================" << std::endl;
443         out << f.name() << std::endl;
444         out << "scalar float   : " << t_float_scalar.count() << "ms" << std::endl;
445 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
446         out << "sse float      : " << t_float_sse.count() << "ms" << std::endl;
447         out << "sse float unr  : " << t_float_sse_u.count() << "ms" << std::endl;
448 #endif
449 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
450         out << "avx float      : " << t_float_avx.count() << "ms" << std::endl;
451         out << "avx float unr  : " << t_float_avx_u.count() << "ms" << std::endl;
452 #endif
453 #if defined(XSIMD_ARM_INSTR_SET)
454         out << "neon float     : " << t_float_neon.count() << "ms" << std::endl;
455         out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl;
456 #endif
457 #if defined(XSIMD_ENABLE_FALLBACK)
458         out << "flbk float     : " << t_float_fallback.count() << "ms" << std::endl;
459         out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl;
460 #endif
461         out << "scalar double  : " << t_double_scalar.count() << "ms" << std::endl;
462 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
463         out << "sse double     : " << t_double_sse.count() << "ms" << std::endl;
464         out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl;
465 #endif
466 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
467         out << "avx double     : " << t_double_avx.count() << "ms" << std::endl;
468         out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl;
469 #endif
470 #if defined(XSIMD_ARM_INSTR_SET)
471         out << "neon double    : " << t_double_neon.count() << "ms" << std::endl;
472         out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl;
473 #endif
474 #if defined(XSIMD_ENABLE_FALLBACK)
475         out << "flbk double    : " << t_double_fallback.count() << "ms" << std::endl;
476         out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl;
477 #endif
478         out << "============================" << std::endl;
479     }
480 
481     template <class F, class OS>
run_benchmark_3op(F f,OS & out,std::size_t size,std::size_t iter)482     void run_benchmark_3op(F f, OS& out, std::size_t size, std::size_t iter)
483     {
484         bench_vector<float> f_op0, f_op1, f_op2, f_res;
485         bench_vector<double> d_op0, d_op1, d_op2, d_res;
486 
487         init_benchmark(f_op0, f_op1, f_op2, f_res, size);
488         init_benchmark(d_op0, d_op1, d_op2, d_res, size);
489 
490         duration_type t_float_scalar = benchmark_scalar(f, f_op0, f_op1, f_op2, f_res, iter);
491 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
492         duration_type t_float_sse = benchmark_simd<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter);
493         duration_type t_float_sse_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter);
494 #endif
495 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
496         duration_type t_float_avx = benchmark_simd<batch<float, 8>>(f, f_op0, f_op1, f_op2, f_res, iter);
497         duration_type t_float_avx_u = benchmark_simd_unrolled<batch<float, 8>>(f, f_op0, f_op1, f_op2, f_res, iter);
498 #endif
499         duration_type t_double_scalar = benchmark_scalar(f, d_op0, d_op1, d_op2, d_res, iter);
500 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
501         duration_type t_double_sse = benchmark_simd<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter);
502         duration_type t_double_sse_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter);
503 #endif
504 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
505         duration_type t_double_avx = benchmark_simd<batch<double, 4>>(f, d_op0, d_op1, d_op2, d_res, iter);
506         duration_type t_double_avx_u = benchmark_simd_unrolled<batch<double, 4>>(f, d_op0, d_op1, d_op2, d_res, iter);
507 #endif
508 #if defined(XSIMD_ARM_INSTR_SET)
509         duration_type t_float_neon = benchmark_simd<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter);
510         duration_type t_float_neon_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter);
511         duration_type t_double_neon = benchmark_simd<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter);
512         duration_type t_double_neon_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter);
513 #endif
514 #if defined(XSIMD_ENABLE_FALLBACK)
515         duration_type t_float_fallback = benchmark_simd<batch<float, 7>>(f, f_op0, f_op1, f_op2, f_res, iter);
516         duration_type t_float_fallback_u = benchmark_simd_unrolled<batch<float, 7>>(f, f_op0, f_op1, f_op2, f_res, iter);
517         duration_type t_double_fallback = benchmark_simd<batch<double, 3>>(f, d_op0, d_op1, d_op2, d_res, iter);
518         duration_type t_double_fallback_u = benchmark_simd_unrolled<batch<double, 3>>(f, d_op0, d_op1, d_op2, d_res, iter);
519 #endif
520 
521         out << "============================" << std::endl;
522         out << f.name() << std::endl;
523         out << "scalar float   : " << t_float_scalar.count() << "ms" << std::endl;
524 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
525         out << "sse float      : " << t_float_sse.count() << "ms" << std::endl;
526         out << "sse float unr  : " << t_float_sse_u.count() << "ms" << std::endl;
527 #endif
528 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
529         out << "avx float      : " << t_float_avx.count() << "ms" << std::endl;
530         out << "avx float unr  : " << t_float_avx_u.count() << "ms" << std::endl;
531 #endif
532 #if defined(XSIMD_ARM_INSTR_SET)
533         out << "neon float     : " << t_float_neon.count() << "ms" << std::endl;
534         out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl;
535 #endif
536 #if defined(XSIMD_ENABLE_FALLBACK)
537         out << "flbk float     : " << t_float_fallback.count() << "ms" << std::endl;
538         out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl;
539 #endif
540         out << "scalar double  : " << t_double_scalar.count() << "ms" << std::endl;
541 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION
542         out << "sse double     : " << t_double_sse.count() << "ms" << std::endl;
543         out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl;
544 #endif
545 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
546         out << "avx double     : " << t_double_avx.count() << "ms" << std::endl;
547         out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl;
548 #endif
549 #if defined(XSIMD_ARM_INSTR_SET)
550         out << "neon double    : " << t_double_neon.count() << "ms" << std::endl;
551         out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl;
552 #endif
553 #if defined(XSIMD_ENABLE_FALLBACK)
554         out << "flbk double    : " << t_double_fallback.count() << "ms" << std::endl;
555         out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl;
556 #endif
557         out << "============================" << std::endl;
558     }
559 
560 
// Defines struct NAME_fn whose call operator returns `lhs OP rhs` for any
// operand type T, and whose name() returns NAME as a string.
// The expansion has no trailing semicolon; the invocation supplies it.
#define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \
    struct NAME##_fn \
    { \
        template <class T> \
        T operator()(const T& lhs, const T& rhs) const \
        { \
            return lhs OP rhs; \
        } \
        \
        std::string name() const \
        { \
            return #NAME; \
        } \
    }
567 
// Defines struct FN_fn whose call operator forwards its single argument to
// FN, and whose name() returns FN as a string. The using-declaration makes
// xsimd::FN visible so the unqualified call can resolve to it.
// The expansion has no trailing semicolon; the invocation supplies it.
#define DEFINE_FUNCTOR_1OP(FN) \
    struct FN##_fn \
    { \
        template <class T> \
        T operator()(const T& x) const \
        { \
            using xsimd::FN; \
            return FN(x); \
        } \
        \
        std::string name() const \
        { \
            return #FN; \
        } \
    }
574 
// Defines struct FN_N_fn whose call operator invokes FN<T, ...>(x), where the
// variadic macro arguments become the extra template arguments of FN, and
// whose name() returns "FN N". The using-declaration makes xsimd::FN visible.
// The expansion has no trailing semicolon; the invocation supplies it.
#define DEFINE_FUNCTOR_1OP_TEMPLATE(FN, N, ...) \
    struct FN##_##N##_fn \
    { \
        template <class T> \
        T operator()(const T& x) const \
        { \
            using xsimd::FN; \
            return FN<T, __VA_ARGS__>(x); \
        } \
        \
        std::string name() const \
        { \
            return #FN " " #N ; \
        } \
    }
581 
// Defines struct FN_fn whose call operator forwards its two arguments to FN,
// and whose name() returns FN as a string. The using-declaration makes
// xsimd::FN visible so the unqualified call can resolve to it.
// The expansion has no trailing semicolon; the invocation supplies it.
#define DEFINE_FUNCTOR_2OP(FN) \
    struct FN##_fn \
    { \
        template <class T> \
        T operator()(const T& lhs, const T& rhs) const \
        { \
            using xsimd::FN; \
            return FN(lhs, rhs); \
        } \
        \
        std::string name() const \
        { \
            return #FN; \
        } \
    }
588 
// Defines struct FN_fn whose call operator forwards its three arguments to
// FN, and whose name() returns FN as a string. The using-declaration makes
// xsimd::FN visible so the unqualified call can resolve to it.
// The expansion has no trailing semicolon; the invocation supplies it.
#define DEFINE_FUNCTOR_3OP(FN) \
    struct FN##_fn \
    { \
        template <class T> \
        T operator()(const T& op0, const T& op1, const T& op2) const \
        { \
            using xsimd::FN; \
            return FN(op0, op1, op2); \
        } \
        \
        std::string name() const \
        { \
            return #FN; \
        } \
    }
595 
596 DEFINE_OP_FUNCTOR_2OP(+, add);
597 DEFINE_OP_FUNCTOR_2OP(-, sub);
598 DEFINE_OP_FUNCTOR_2OP(*, mul);
599 DEFINE_OP_FUNCTOR_2OP(/, div);
600 
601 DEFINE_FUNCTOR_1OP(exp);
602 DEFINE_FUNCTOR_1OP(exp2);
603 DEFINE_FUNCTOR_1OP(expm1);
604 DEFINE_FUNCTOR_1OP(log);
605 DEFINE_FUNCTOR_1OP(log10);
606 DEFINE_FUNCTOR_1OP(log2);
607 DEFINE_FUNCTOR_1OP(log1p);
608 
609 DEFINE_FUNCTOR_1OP(sin);
610 DEFINE_FUNCTOR_1OP(cos);
611 DEFINE_FUNCTOR_1OP(tan);
612 DEFINE_FUNCTOR_1OP(asin);
613 DEFINE_FUNCTOR_1OP(acos);
614 DEFINE_FUNCTOR_1OP(atan);
615 
616 DEFINE_FUNCTOR_1OP(sinh);
617 DEFINE_FUNCTOR_1OP(cosh);
618 DEFINE_FUNCTOR_1OP(tanh);
619 DEFINE_FUNCTOR_1OP(asinh);
620 DEFINE_FUNCTOR_1OP(acosh);
621 DEFINE_FUNCTOR_1OP(atanh);
622 
623 DEFINE_FUNCTOR_2OP(pow);
624 DEFINE_FUNCTOR_1OP(sqrt);
625 DEFINE_FUNCTOR_1OP(cbrt);
626 DEFINE_FUNCTOR_2OP(hypot);
627 
628 DEFINE_FUNCTOR_1OP(ceil);
629 DEFINE_FUNCTOR_1OP(floor);
630 DEFINE_FUNCTOR_1OP(trunc);
631 DEFINE_FUNCTOR_1OP(round);
632 DEFINE_FUNCTOR_1OP(nearbyint);
633 DEFINE_FUNCTOR_1OP(rint);
634 
635 DEFINE_FUNCTOR_2OP(fmod);
636 DEFINE_FUNCTOR_2OP(remainder);
637 DEFINE_FUNCTOR_2OP(fdim);
638 DEFINE_FUNCTOR_3OP(clip);
639 #if 0
640 DEFINE_FUNCTOR_1OP(isfinite);
641 DEFINE_FUNCTOR_1OP(isinf);
642 DEFINE_FUNCTOR_1OP(is_flint);
643 DEFINE_FUNCTOR_1OP(is_odd);
644 DEFINE_FUNCTOR_1OP(is_even);
645 #endif
646 
647 #ifdef XSIMD_POLY_BENCHMARKS
648 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 5, 1, 2, 3, 4, 5);
649 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 5, 1, 2, 3, 4, 5);
650 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
651 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
652 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
653 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
654 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
655 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
656 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
657 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
658 #endif
659 
660 }
661 
662 #endif
663