1 /*************************************************************************** 2 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * 3 * Martin Renou * 4 * Copyright (c) QuantStack * 5 * * 6 * Distributed under the terms of the BSD 3-Clause License. * 7 * * 8 * The full license is in the file LICENSE, distributed with this software. * 9 ****************************************************************************/ 10 11 #ifndef XSIMD_BENCHMARK_HPP 12 #define XSIMD_BENCHMARK_HPP 13 14 #include <chrono> 15 #include <string> 16 #include <vector> 17 #include <iostream> 18 #include "xsimd/xsimd.hpp" 19 20 namespace xsimd 21 { 22 template <class T> 23 std::string batch_name(); 24 batch_name()25 template <> inline std::string batch_name<batch<float, 4>>() { return "sse/neon float"; } batch_name()26 template <> inline std::string batch_name<batch<double, 2>>() { return "sse/neon double"; } batch_name()27 template <> inline std::string batch_name<batch<float, 8>>() { return "avx float"; } batch_name()28 template <> inline std::string batch_name<batch<double, 4>>() { return "avx double"; } batch_name()29 template <> inline std::string batch_name<batch<float, 7>>() { return "fallback float"; } batch_name()30 template <> inline std::string batch_name<batch<double, 3>>() { return "fallback double"; } 31 32 using duration_type = std::chrono::duration<double, std::milli>; 33 34 template <class T> 35 using bench_vector = std::vector<T, xsimd::aligned_allocator<T, XSIMD_DEFAULT_ALIGNMENT>>; 36 37 template <class T> init_benchmark(bench_vector<T> & lhs,bench_vector<T> & rhs,bench_vector<T> & res,size_t size)38 void init_benchmark(bench_vector<T>& lhs, bench_vector<T>& rhs, bench_vector<T>& res, size_t size) 39 { 40 lhs.resize(size); 41 rhs.resize(size); 42 res.resize(size); 43 for (size_t i = 0; i < size; ++i) 44 { 45 lhs[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); 46 rhs[i] = T(10.2) / T(i + 2) + T(0.25); 47 } 48 } 49 50 template <class T> init_benchmark(bench_vector<T> & op0,bench_vector<T> & op1,bench_vector<T> & op2,bench_vector<T> & res,size_t size)51 void init_benchmark(bench_vector<T>& op0, bench_vector<T>& op1, bench_vector<T>& op2, bench_vector<T>& res, size_t size) 52 { 53 op0.resize(size); 54 op1.resize(size); 55 op2.resize(size); 56 res.resize(size); 57 for (size_t i = 0; i < size; ++i) 58 { 59 op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); 60 op1[i] = T(10.2) / T(i + 2) + T(0.25); 61 op2[i] = T(20.1) / T(i + 5) + T(0.65); 62 } 63 } 64 65 template <class T> init_benchmark_arctrigo(bench_vector<T> & lhs,bench_vector<T> & rhs,bench_vector<T> & res,size_t size)66 void init_benchmark_arctrigo(bench_vector<T>& lhs, bench_vector<T>& rhs, bench_vector<T>& res, size_t size) 67 { 68 lhs.resize(size); 69 rhs.resize(size); 70 res.resize(size); 71 for (size_t i = 0; i < size; ++i) 72 { 73 lhs[i] = T(-1.) + T(2.) * T(i) / T(size); 74 rhs[i] = T(i) / T(i + 2) + T(0.25); 75 } 76 } 77 78 enum class init_method 79 { 80 classic, 81 arctrigo 82 }; 83 84 template <class F, class V> benchmark_scalar(F f,V & lhs,V & res,std::size_t number)85 duration_type benchmark_scalar(F f, V& lhs, V& res, std::size_t number) 86 { 87 size_t s = lhs.size(); 88 duration_type t_res = duration_type::max(); 89 for (std::size_t count = 0; count < number; ++count) 90 { 91 auto start = std::chrono::steady_clock::now(); 92 for (size_t i = 0; i < s; ++i) 93 { 94 res[i] = f(lhs[i]); 95 } 96 auto end = std::chrono::steady_clock::now(); 97 auto tmp = end - start; 98 t_res = tmp < t_res ? tmp : t_res; 99 } 100 return t_res; 101 } 102 103 template <class F, class V> benchmark_scalar(F f,V & lhs,V & rhs,V & res,std::size_t number)104 duration_type benchmark_scalar(F f, V& lhs, V& rhs, V& res, std::size_t number) 105 { 106 size_t s = lhs.size(); 107 duration_type t_res = duration_type::max(); 108 for (std::size_t count = 0; count < number; ++count) 109 { 110 auto start = std::chrono::steady_clock::now(); 111 for (size_t i = 0; i < s; ++i) 112 { 113 res[i] = f(lhs[i], rhs[i]); 114 } 115 auto end = std::chrono::steady_clock::now(); 116 auto tmp = end - start; 117 t_res = tmp < t_res ? tmp : t_res; 118 } 119 return t_res; 120 } 121 122 template <class F, class V> benchmark_scalar(F f,V & op0,V & op1,V & op2,V & res,std::size_t number)123 duration_type benchmark_scalar(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) 124 { 125 size_t s = op0.size(); 126 duration_type t_res = duration_type::max(); 127 for (std::size_t count = 0; count < number; ++count) 128 { 129 auto start = std::chrono::steady_clock::now(); 130 for (size_t i = 0; i < s; ++i) 131 { 132 res[i] = f(op0[i], op1[i], op2[i]); 133 } 134 auto end = std::chrono::steady_clock::now(); 135 auto tmp = end - start; 136 t_res = tmp < t_res ? tmp : t_res; 137 } 138 return t_res; 139 } 140 141 template <class B, class F, class V> benchmark_simd(F f,V & lhs,V & res,std::size_t number)142 duration_type benchmark_simd(F f, V& lhs, V& res, std::size_t number) 143 { 144 std::size_t s = lhs.size(); 145 duration_type t_res = duration_type::max(); 146 for (std::size_t count = 0; count < number; ++count) 147 { 148 auto start = std::chrono::steady_clock::now(); 149 for (std::size_t i = 0; i <= (s - B::size); i += B::size) 150 { 151 B blhs(&lhs[i], aligned_mode()); 152 B bres = f(blhs); 153 bres.store_aligned(&res[i]); 154 } 155 auto end = std::chrono::steady_clock::now(); 156 auto tmp = end - start; 157 t_res = tmp < t_res ? tmp : t_res; 158 } 159 return t_res; 160 } 161 162 template <class B, class F, class V> benchmark_simd_unrolled(F f,V & lhs,V & res,std::size_t number)163 duration_type benchmark_simd_unrolled(F f, V& lhs, V& res, std::size_t number) 164 { 165 std::size_t s = lhs.size(); 166 std::size_t inc = 4 * B::size; 167 duration_type t_res = duration_type::max(); 168 for (std::size_t count = 0; count < number; ++count) 169 { 170 auto start = std::chrono::steady_clock::now(); 171 for (std::size_t i = 0; i <= (s - inc); i += inc) 172 { 173 size_t j = i + B::size; 174 size_t k = j + B::size; 175 size_t l = k + B::size; 176 B blhs(&lhs[i], aligned_mode()), blhs2(&lhs[j], aligned_mode()), 177 blhs3(&lhs[k], aligned_mode()), blhs4(&lhs[l], aligned_mode()); 178 B bres = f(blhs); 179 B bres2 = f(blhs2); 180 B bres3 = f(blhs3); 181 B bres4 = f(blhs4); 182 bres.store_aligned(&res[i]); 183 bres2.store_aligned(&res[j]); 184 bres3.store_aligned(&res[k]); 185 bres4.store_aligned(&res[l]); 186 } 187 auto end = std::chrono::steady_clock::now(); 188 auto tmp = end - start; 189 t_res = tmp < t_res ? tmp : t_res; 190 } 191 return t_res; 192 } 193 194 template <class B, class F, class V> benchmark_simd(F f,V & lhs,V & rhs,V & res,std::size_t number)195 duration_type benchmark_simd(F f, V& lhs, V& rhs, V& res, std::size_t number) 196 { 197 std::size_t s = lhs.size(); 198 duration_type t_res = duration_type::max(); 199 for (std::size_t count = 0; count < number; ++count) 200 { 201 auto start = std::chrono::steady_clock::now(); 202 for (std::size_t i = 0; i <= (s - B::size); i += B::size) 203 { 204 B blhs(&lhs[i], aligned_mode()), brhs(&rhs[i], aligned_mode()); 205 B bres = f(blhs, brhs); 206 bres.store_aligned(&res[i]); 207 } 208 auto end = std::chrono::steady_clock::now(); 209 auto tmp = end - start; 210 t_res = tmp < t_res ? tmp : t_res; 211 } 212 return t_res; 213 } 214 215 template <class B, class F, class V> benchmark_simd_unrolled(F f,V & lhs,V & rhs,V & res,std::size_t number)216 duration_type benchmark_simd_unrolled(F f, V& lhs, V& rhs, V& res, std::size_t number) 217 { 218 std::size_t s = lhs.size(); 219 std::size_t inc = 4 * B::size; 220 duration_type t_res = duration_type::max(); 221 for (std::size_t count = 0; count < number; ++count) 222 { 223 auto start = std::chrono::steady_clock::now(); 224 for (std::size_t i = 0; i <= (s - inc); i += inc) 225 { 226 size_t j = i + B::size; 227 size_t k = j + B::size; 228 size_t l = k + B::size; 229 B blhs(&lhs[i], aligned_mode()), brhs(&rhs[i], aligned_mode()), 230 blhs2(&lhs[j], aligned_mode()), brhs2(&rhs[j], aligned_mode()); 231 B blhs3(&lhs[k], aligned_mode()), brhs3(&rhs[k], aligned_mode()), 232 blhs4(&lhs[l], aligned_mode()), brhs4(&rhs[l], aligned_mode()); 233 B bres = f(blhs, brhs); 234 B bres2 = f(blhs2, brhs2); 235 B bres3 = f(blhs3, brhs3); 236 B bres4 = f(blhs4, brhs4); 237 bres.store_aligned(&res[i]); 238 bres2.store_aligned(&res[j]); 239 bres3.store_aligned(&res[k]); 240 bres4.store_aligned(&res[l]); 241 } 242 auto end = std::chrono::steady_clock::now(); 243 auto tmp = end - start; 244 t_res = tmp < t_res ? tmp : t_res; 245 } 246 return t_res; 247 } 248 249 250 template <class B, class F, class V> benchmark_simd(F f,V & op0,V & op1,V & op2,V & res,std::size_t number)251 duration_type benchmark_simd(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) 252 { 253 std::size_t s = op0.size(); 254 duration_type t_res = duration_type::max(); 255 for (std::size_t count = 0; count < number; ++count) 256 { 257 auto start = std::chrono::steady_clock::now(); 258 for (std::size_t i = 0; i <= (s - B::size); i += B::size) 259 { 260 B bop0(&op0[i], aligned_mode()), 261 bop1(&op1[i], aligned_mode()), 262 bop2(&op2[i], aligned_mode()); 263 B bres = f(bop0, bop1, bop2); 264 bres.store_aligned(&res[i]); 265 } 266 auto end = std::chrono::steady_clock::now(); 267 auto tmp = end - start; 268 t_res = tmp < t_res ? tmp : t_res; 269 } 270 return t_res; 271 } 272 273 template <class B, class F, class V> benchmark_simd_unrolled(F f,V & op0,V & op1,V & op2,V & res,std::size_t number)274 duration_type benchmark_simd_unrolled(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) 275 { 276 std::size_t s = op0.size(); 277 std::size_t inc = 4 * B::size; 278 duration_type t_res = duration_type::max(); 279 for (std::size_t count = 0; count < number; ++count) 280 { 281 auto start = std::chrono::steady_clock::now(); 282 for (std::size_t i = 0; i <= (s - inc); i += inc) 283 { 284 size_t j = i + B::size; 285 size_t k = j + B::size; 286 size_t l = k + B::size; 287 B bop0_i(&op0[i], aligned_mode()), bop1_i(&op1[i], aligned_mode()), bop2_i(&op2[i], aligned_mode()); 288 B bop0_j(&op0[j], aligned_mode()), bop1_j(&op1[j], aligned_mode()), bop2_j(&op2[j], aligned_mode()); 289 B bop0_k(&op0[k], aligned_mode()), bop1_k(&op1[k], aligned_mode()), bop2_k(&op2[k], aligned_mode()); 290 B bop0_l(&op0[l], aligned_mode()), bop1_l(&op1[l], aligned_mode()), bop2_l(&op2[l], aligned_mode()); 291 B bres_i = f(bop0_i, bop1_i, bop2_i); 292 B bres_j = f(bop0_j, bop1_j, bop2_j); 293 B bres_k = f(bop0_k, bop1_k, bop2_k); 294 B bres_l = f(bop0_l, bop1_l, bop2_l); 295 bres_i.store_aligned(&res[i]); 296 bres_j.store_aligned(&res[j]); 297 bres_k.store_aligned(&res[k]); 298 bres_l.store_aligned(&res[l]); 299 } 300 auto end = std::chrono::steady_clock::now(); 301 auto tmp = end - start; 302 t_res = tmp < t_res ? tmp : t_res; 303 } 304 return t_res; 305 } 306 307 template <class F, class OS> run_benchmark_1op(F f,OS & out,std::size_t size,std::size_t iter,init_method init=init_method::classic)308 void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic) 309 { 310 bench_vector<float> f_lhs, f_rhs, f_res; 311 bench_vector<double> d_lhs, d_rhs, d_res; 312 313 switch (init) 314 { 315 case init_method::classic: 316 init_benchmark(f_lhs, f_rhs, f_res, size); 317 init_benchmark(d_lhs, d_rhs, d_res, size); 318 break; 319 case init_method::arctrigo: 320 init_benchmark_arctrigo(f_lhs, f_rhs, f_res, size); 321 init_benchmark_arctrigo(d_lhs, d_rhs, d_res, size); 322 break; 323 default: 324 init_benchmark(f_lhs, f_rhs, f_res, size); 325 init_benchmark(d_lhs, d_rhs, d_res, size); 326 break; 327 } 328 329 #ifndef XSIMD_POLY_BENCHMARKS 330 duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_res, iter); 331 duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_res, iter); 332 #endif 333 334 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 335 duration_type t_float_sse = benchmark_simd<batch<float, 4>>(f, f_lhs, f_res, iter); 336 duration_type t_float_sse_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_res, iter); 337 duration_type t_double_sse = benchmark_simd<batch<double, 2>>(f, d_lhs, d_res, iter); 338 duration_type t_double_sse_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_res, iter); 339 #endif 340 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 341 duration_type t_float_avx = benchmark_simd<batch<float, 8>>(f, f_lhs, f_res, iter); 342 duration_type t_float_avx_u = benchmark_simd_unrolled<batch<float, 8>>(f, f_lhs, f_res, iter); 343 duration_type t_double_avx = benchmark_simd<batch<double, 4>>(f, d_lhs, d_res, iter); 344 duration_type t_double_avx_u = benchmark_simd_unrolled<batch<double, 4>>(f, d_lhs, d_res, iter); 345 #endif 346 #if defined(XSIMD_ARM_INSTR_SET) 347 duration_type t_float_neon = benchmark_simd<batch<float, 4>>(f, f_lhs, f_res, iter); 348 duration_type t_float_neon_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_res, iter); 349 duration_type t_double_neon = benchmark_simd<batch<double, 2>>(f, d_lhs, d_res, iter); 350 duration_type t_double_neon_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_res, iter); 351 #endif 352 #if defined(XSIMD_ENABLE_FALLBACK) 353 duration_type t_float_fallback = benchmark_simd<batch<float, 7>>(f, f_lhs, f_res, iter); 354 duration_type t_float_fallback_u = benchmark_simd_unrolled<batch<float, 7>>(f, f_lhs, f_res, iter); 355 duration_type t_double_fallback = benchmark_simd<batch<double, 3>>(f, d_lhs, d_res, iter); 356 duration_type t_double_fallback_u = benchmark_simd_unrolled<batch<double, 3>>(f, d_lhs, d_res, iter); 357 #endif 358 359 out << "============================" << std::endl; 360 out << f.name() << std::endl; 361 #ifndef XSIMD_POLY_BENCHMARKS 362 out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; 363 #endif 364 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 365 out << "sse float : " << t_float_sse.count() << "ms" << std::endl; 366 out << "sse float unr : " << t_float_sse_u.count() << "ms" << std::endl; 367 #endif 368 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 369 out << "avx float : " << t_float_avx.count() << "ms" << std::endl; 370 out << "avx float unr : " << t_float_avx_u.count() << "ms" << std::endl; 371 #endif 372 #if defined(XSIMD_ARM_INSTR_SET) 373 out << "neon float : " << t_float_neon.count() << "ms" << std::endl; 374 out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl; 375 #endif 376 #if defined(XSIMD_ENABLE_FALLBACK) 377 out << "flbk float : " << t_float_fallback.count() << "ms" << std::endl; 378 out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl; 379 #endif 380 #ifndef XSIMD_POLY_BENCHMARKS 381 out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; 382 #endif 383 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 384 out << "sse double : " << t_double_sse.count() << "ms" << std::endl; 385 out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl; 386 #endif 387 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 388 out << "avx double : " << t_double_avx.count() << "ms" << std::endl; 389 out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl; 390 #endif 391 #if defined(XSIMD_ARM_INSTR_SET) 392 out << "neon double : " << t_double_neon.count() << "ms" << std::endl; 393 out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl; 394 #endif 395 #if defined(XSIMD_ENABLE_FALLBACK) 396 out << "flbk double : " << t_double_fallback.count() << "ms" << std::endl; 397 out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl; 398 #endif 399 out << "============================" << std::endl; 400 } 401 402 template <class F, class OS> run_benchmark_2op(F f,OS & out,std::size_t size,std::size_t iter)403 void run_benchmark_2op(F f, OS& out, std::size_t size, std::size_t iter) 404 { 405 bench_vector<float> f_lhs, f_rhs, f_res; 406 bench_vector<double> d_lhs, d_rhs, d_res; 407 408 init_benchmark(f_lhs, f_rhs, f_res, size); 409 init_benchmark(d_lhs, d_rhs, d_res, size); 410 411 duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_rhs, f_res, iter); 412 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 413 duration_type t_float_sse = benchmark_simd<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter); 414 duration_type t_float_sse_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter); 415 #endif 416 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 417 duration_type t_float_avx = benchmark_simd<batch<float, 8>>(f, f_lhs, f_rhs, f_res, iter); 418 duration_type t_float_avx_u = benchmark_simd_unrolled<batch<float, 8>>(f, f_lhs, f_rhs, f_res, iter); 419 #endif 420 duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_rhs, d_res, iter); 421 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 422 duration_type t_double_sse = benchmark_simd<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter); 423 duration_type t_double_sse_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter); 424 #endif 425 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 426 duration_type t_double_avx = benchmark_simd<batch<double, 4>>(f, d_lhs, d_rhs, d_res, iter); 427 duration_type t_double_avx_u = benchmark_simd_unrolled<batch<double, 4>>(f, d_lhs, d_rhs, d_res, iter); 428 #endif 429 #if defined(XSIMD_ARM_INSTR_SET) 430 duration_type t_float_neon = benchmark_simd<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter); 431 duration_type t_float_neon_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_lhs, f_rhs, f_res, iter); 432 duration_type t_double_neon = benchmark_simd<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter); 433 duration_type t_double_neon_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_lhs, d_rhs, d_res, iter); 434 #endif 435 #if defined(XSIMD_ENABLE_FALLBACK) 436 duration_type t_float_fallback = benchmark_simd<batch<float, 7>>(f, f_lhs, f_rhs, f_res, iter); 437 duration_type t_float_fallback_u = benchmark_simd_unrolled<batch<float, 7>>(f, f_lhs, f_rhs, f_res, iter); 438 duration_type t_double_fallback = benchmark_simd<batch<double, 3>>(f, d_lhs, d_rhs, d_res, iter); 439 duration_type t_double_fallback_u = benchmark_simd_unrolled<batch<double, 3>>(f, d_lhs, d_rhs, d_res, iter); 440 #endif 441 442 out << "============================" << std::endl; 443 out << f.name() << std::endl; 444 out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; 445 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 446 out << "sse float : " << t_float_sse.count() << "ms" << std::endl; 447 out << "sse float unr : " << t_float_sse_u.count() << "ms" << std::endl; 448 #endif 449 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 450 out << "avx float : " << t_float_avx.count() << "ms" << std::endl; 451 out << "avx float unr : " << t_float_avx_u.count() << "ms" << std::endl; 452 #endif 453 #if defined(XSIMD_ARM_INSTR_SET) 454 out << "neon float : " << t_float_neon.count() << "ms" << std::endl; 455 out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl; 456 #endif 457 #if defined(XSIMD_ENABLE_FALLBACK) 458 out << "flbk float : " << t_float_fallback.count() << "ms" << std::endl; 459 out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl; 460 #endif 461 out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; 462 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 463 out << "sse double : " << t_double_sse.count() << "ms" << std::endl; 464 out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl; 465 #endif 466 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 467 out << "avx double : " << t_double_avx.count() << "ms" << std::endl; 468 out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl; 469 #endif 470 #if defined(XSIMD_ARM_INSTR_SET) 471 out << "neon double : " << t_double_neon.count() << "ms" << std::endl; 472 out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl; 473 #endif 474 #if defined(XSIMD_ENABLE_FALLBACK) 475 out << "flbk double : " << t_double_fallback.count() << "ms" << std::endl; 476 out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl; 477 #endif 478 out << "============================" << std::endl; 479 } 480 481 template <class F, class OS> run_benchmark_3op(F f,OS & out,std::size_t size,std::size_t iter)482 void run_benchmark_3op(F f, OS& out, std::size_t size, std::size_t iter) 483 { 484 bench_vector<float> f_op0, f_op1, f_op2, f_res; 485 bench_vector<double> d_op0, d_op1, d_op2, d_res; 486 487 init_benchmark(f_op0, f_op1, f_op2, f_res, size); 488 init_benchmark(d_op0, d_op1, d_op2, d_res, size); 489 490 duration_type t_float_scalar = benchmark_scalar(f, f_op0, f_op1, f_op2, f_res, iter); 491 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 492 duration_type t_float_sse = benchmark_simd<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter); 493 duration_type t_float_sse_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter); 494 #endif 495 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 496 duration_type t_float_avx = benchmark_simd<batch<float, 8>>(f, f_op0, f_op1, f_op2, f_res, iter); 497 duration_type t_float_avx_u = benchmark_simd_unrolled<batch<float, 8>>(f, f_op0, f_op1, f_op2, f_res, iter); 498 #endif 499 duration_type t_double_scalar = benchmark_scalar(f, d_op0, d_op1, d_op2, d_res, iter); 500 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 501 duration_type t_double_sse = benchmark_simd<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter); 502 duration_type t_double_sse_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter); 503 #endif 504 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 505 duration_type t_double_avx = benchmark_simd<batch<double, 4>>(f, d_op0, d_op1, d_op2, d_res, iter); 506 duration_type t_double_avx_u = benchmark_simd_unrolled<batch<double, 4>>(f, d_op0, d_op1, d_op2, d_res, iter); 507 #endif 508 #if defined(XSIMD_ARM_INSTR_SET) 509 duration_type t_float_neon = benchmark_simd<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter); 510 duration_type t_float_neon_u = benchmark_simd_unrolled<batch<float, 4>>(f, f_op0, f_op1, f_op2, f_res, iter); 511 duration_type t_double_neon = benchmark_simd<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter); 512 duration_type t_double_neon_u = benchmark_simd_unrolled<batch<double, 2>>(f, d_op0, d_op1, d_op2, d_res, iter); 513 #endif 514 #if defined(XSIMD_ENABLE_FALLBACK) 515 duration_type t_float_fallback = benchmark_simd<batch<float, 7>>(f, f_op0, f_op1, f_op2, f_res, iter); 516 duration_type t_float_fallback_u = benchmark_simd_unrolled<batch<float, 7>>(f, f_op0, f_op1, f_op2, f_res, iter); 517 duration_type t_double_fallback = benchmark_simd<batch<double, 3>>(f, d_op0, d_op1, d_op2, d_res, iter); 518 duration_type t_double_fallback_u = benchmark_simd_unrolled<batch<double, 3>>(f, d_op0, d_op1, d_op2, d_res, iter); 519 #endif 520 521 out << "============================" << std::endl; 522 out << f.name() << std::endl; 523 out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; 524 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 525 out << "sse float : " << t_float_sse.count() << "ms" << std::endl; 526 out << "sse float unr : " << t_float_sse_u.count() << "ms" << std::endl; 527 #endif 528 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 529 out << "avx float : " << t_float_avx.count() << "ms" << std::endl; 530 out << "avx float unr : " << t_float_avx_u.count() << "ms" << std::endl; 531 #endif 532 #if defined(XSIMD_ARM_INSTR_SET) 533 out << "neon float : " << t_float_neon.count() << "ms" << std::endl; 534 out << "neon float unr : " << t_float_neon_u.count() << "ms" << std::endl; 535 #endif 536 #if defined(XSIMD_ENABLE_FALLBACK) 537 out << "flbk float : " << t_float_fallback.count() << "ms" << std::endl; 538 out << "flbk float unr : " << t_float_fallback_u.count() << "ms" << std::endl; 539 #endif 540 out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; 541 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION 542 out << "sse double : " << t_double_sse.count() << "ms" << std::endl; 543 out << "sse double unr : " << t_double_sse_u.count() << "ms" << std::endl; 544 #endif 545 #if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION 546 out << "avx double : " << t_double_avx.count() << "ms" << std::endl; 547 out << "avx double unr : " << t_double_avx_u.count() << "ms" << std::endl; 548 #endif 549 #if defined(XSIMD_ARM_INSTR_SET) 550 out << "neon double : " << t_double_neon.count() << "ms" << std::endl; 551 out << "neon double unr: " << t_double_neon_u.count() << "ms" << std::endl; 552 #endif 553 #if defined(XSIMD_ENABLE_FALLBACK) 554 out << "flbk double : " << t_double_fallback.count() << "ms" << std::endl; 555 out << "flbk double unr: " << t_double_fallback_u.count() << "ms" << std::endl; 556 #endif 557 out << "============================" << std::endl; 558 } 559 560 561 #define DEFINE_OP_FUNCTOR_2OP(OP, NAME)\ 562 struct NAME##_fn {\ 563 template <class T>\ 564 inline T operator()(const T& lhs, const T& rhs) const { return lhs OP rhs; }\ 565 inline std::string name() const { return #NAME; }\ 566 } 567 568 #define DEFINE_FUNCTOR_1OP(FN)\ 569 struct FN##_fn {\ 570 template <class T>\ 571 inline T operator()(const T& x) const { using xsimd::FN; return FN(x); }\ 572 inline std::string name() const { return #FN; }\ 573 } 574 575 #define DEFINE_FUNCTOR_1OP_TEMPLATE(FN, N, ...)\ 576 struct FN##_##N##_fn {\ 577 template <class T>\ 578 inline T operator()(const T& x) const { using xsimd::FN; return FN<T, __VA_ARGS__>(x); }\ 579 inline std::string name() const { return #FN " " #N ; }\ 580 } 581 582 #define DEFINE_FUNCTOR_2OP(FN)\ 583 struct FN##_fn{\ 584 template <class T>\ 585 inline T operator()(const T&lhs, const T& rhs) const { using xsimd::FN; return FN(lhs, rhs); }\ 586 inline std::string name() const { return #FN; }\ 587 } 588 589 #define DEFINE_FUNCTOR_3OP(FN)\ 590 struct FN##_fn{\ 591 template <class T>\ 592 inline T operator()(const T& op0, const T& op1, const T& op2) const { using xsimd::FN; return FN(op0, op1, op2); }\ 593 inline std::string name() const { return #FN; }\ 594 } 595 596 DEFINE_OP_FUNCTOR_2OP(+, add); 597 DEFINE_OP_FUNCTOR_2OP(-, sub); 598 DEFINE_OP_FUNCTOR_2OP(*, mul); 599 DEFINE_OP_FUNCTOR_2OP(/, div); 600 601 DEFINE_FUNCTOR_1OP(exp); 602 DEFINE_FUNCTOR_1OP(exp2); 603 DEFINE_FUNCTOR_1OP(expm1); 604 DEFINE_FUNCTOR_1OP(log); 605 DEFINE_FUNCTOR_1OP(log10); 606 DEFINE_FUNCTOR_1OP(log2); 607 DEFINE_FUNCTOR_1OP(log1p); 608 609 DEFINE_FUNCTOR_1OP(sin); 610 DEFINE_FUNCTOR_1OP(cos); 611 DEFINE_FUNCTOR_1OP(tan); 612 DEFINE_FUNCTOR_1OP(asin); 613 DEFINE_FUNCTOR_1OP(acos); 614 DEFINE_FUNCTOR_1OP(atan); 615 616 DEFINE_FUNCTOR_1OP(sinh); 617 DEFINE_FUNCTOR_1OP(cosh); 618 DEFINE_FUNCTOR_1OP(tanh); 619 DEFINE_FUNCTOR_1OP(asinh); 620 DEFINE_FUNCTOR_1OP(acosh); 621 DEFINE_FUNCTOR_1OP(atanh); 622 623 DEFINE_FUNCTOR_2OP(pow); 624 DEFINE_FUNCTOR_1OP(sqrt); 625 DEFINE_FUNCTOR_1OP(cbrt); 626 DEFINE_FUNCTOR_2OP(hypot); 627 628 DEFINE_FUNCTOR_1OP(ceil); 629 DEFINE_FUNCTOR_1OP(floor); 630 DEFINE_FUNCTOR_1OP(trunc); 631 DEFINE_FUNCTOR_1OP(round); 632 DEFINE_FUNCTOR_1OP(nearbyint); 633 DEFINE_FUNCTOR_1OP(rint); 634 635 DEFINE_FUNCTOR_2OP(fmod); 636 DEFINE_FUNCTOR_2OP(remainder); 637 DEFINE_FUNCTOR_2OP(fdim); 638 DEFINE_FUNCTOR_3OP(clip); 639 #if 0 640 DEFINE_FUNCTOR_1OP(isfinite); 641 DEFINE_FUNCTOR_1OP(isinf); 642 DEFINE_FUNCTOR_1OP(is_flint); 643 DEFINE_FUNCTOR_1OP(is_odd); 644 DEFINE_FUNCTOR_1OP(is_even); 645 #endif 646 647 #ifdef XSIMD_POLY_BENCHMARKS 648 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 5, 1, 2, 3, 4, 5); 649 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 5, 1, 2, 3, 4, 5); 650 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); 651 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); 652 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); 653 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); 654 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 655 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 656 DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 657 DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 658 #endif 659 660 } 661 662 #endif 663