1 #include <benchmark/benchmark.h>
2 #include <bitset>
3 #include <cstdint>
4 #include <stdio.h>
5
6 #include "../common.h"
7 #include "03_popcnt_ispc.h"
8
9 static Docs docs("Check popcnt implmentation of stdlib functions:\n"
10 "[int32, int64] x [uniform, varying] x [all, even] versions.\n"
11 "Observations:\n"
12 " - popcnt is very lightweight, so 8 popcnt are chained.\n"
13 " - chaining doesn't cause optimizing out popcnt, 8 seems to be good enough"
14 " - varying versions have overhead on insert/extract\n"
15 " - for even versions mask is statically known and compiler is able to optimize out inactive lanes\n"
16 "Expectation:\n"
17 " - No regressions\n");
18
19 // Minimum size is maximum target width, i.e. 64.
20 // Larger buffer is better, but preferably to stay within L1.
21 #define ARGS Arg(8192)
22 //#define ARGS RangeMultiplier(2)->Range(64, 64<<15)->Complexity(benchmark::oN)
23
init(T * src,T * dst,int count)24 template <typename T> static void init(T *src, T *dst, int count) {
25 for (int i = 0; i < count; i++) {
26 src[i] = static_cast<T>(i);
27 dst[i] = 0;
28 }
29 }
30
check_all(T * src,T * dst,int count)31 template <typename T> static void check_all(T *src, T *dst, int count) {
32 for (int i = 0; i < count; i++) {
33 // for single popcnt() here's the formula, but we chain 8 popcnt()
34 // int count = std::bitset< std::numeric_limits<T>::digits >(src[i]).count();
35 int count = (i == 0) ? 0 : 1;
36 if (dst[i] != count) {
37 printf("Error i=%d\n", i);
38 return;
39 }
40 }
41 }
42
check_even(T * src,T * dst,int count)43 template <typename T> static void check_even(T *src, T *dst, int count) {
44 for (int i = 0; i < count; i += 2) {
45 // for single popcnt() here's the formula, but we chain 8 popcnt()
46 // int count = std::bitset< std::numeric_limits<T>::digits >(src[i]).count();
47 int count = (i == 0) ? 0 : 1;
48 if (dst[i] != count) {
49 printf("Error i=%d\n", i);
50 return;
51 }
52 }
53 }
54
55 #define POPCNT(T_C, T_ISPC, V, ALL) \
56 static void popcnt_stdlib_##V##_##T_ISPC##_##ALL(benchmark::State &state) { \
57 int count = static_cast<int>(state.range(0)); \
58 T_C *src = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * count)); \
59 T_C *dst = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * count)); \
60 init(src, dst, count); \
61 \
62 for (auto _ : state) { \
63 ispc::popcnt_##V##_##T_ISPC##_##ALL(src, dst, count); \
64 } \
65 \
66 check_##ALL(src, dst, count); \
67 aligned_free_helper(src); \
68 aligned_free_helper(dst); \
69 state.SetComplexityN(state.range(0)); \
70 } \
71 BENCHMARK(popcnt_stdlib_##V##_##T_ISPC##_##ALL)->ARGS;
72
73 POPCNT(int, int32, uniform, all)
74 POPCNT(int, int32, varying, all)
75 POPCNT(int, int32, uniform, even)
76 POPCNT(int, int32, varying, even)
77 POPCNT(int64_t, int64, uniform, all)
78 POPCNT(int64_t, int64, varying, all)
79 POPCNT(int64_t, int64, uniform, even)
80 POPCNT(int64_t, int64, varying, even)
81
82 BENCHMARK_MAIN();
83