1 #ifndef BENCHMARK_H
2 #define BENCHMARK_H
3 
4 #include <algorithm>
5 #include <cassert>
6 #include <chrono>
7 #include <functional>
8 #include <limits>
9 
10 #if defined(__EMSCRIPTEN__)
11 #include <emscripten.h>
12 #endif
13 
14 namespace Halide {
15 namespace Tools {
16 
17 #if !defined(__EMSCRIPTEN__)
18 
19 // Prefer high_resolution_clock, but only if it's steady...
20 template<bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
21 struct SteadyClock {
22     using type = std::chrono::high_resolution_clock;
23 };
24 
25 // ...otherwise use steady_clock.
26 template<>
27 struct SteadyClock<false> {
28     using type = std::chrono::steady_clock;
29 };
30 
31 inline SteadyClock<>::type::time_point benchmark_now() {
32     return SteadyClock<>::type::now();
33 }
34 
35 inline double benchmark_duration_seconds(
36     SteadyClock<>::type::time_point start,
37     SteadyClock<>::type::time_point end) {
38     return std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
39 }
40 
41 #else  // __EMSCRIPTEN__
42 
43 // Emscripten's std::chrono::steady_clock and/or high_resolution_clock
44 // can throw an exception (!) if the runtime doesn't have a truly
45 // steady clock available. Advice from emscripten-discuss suggested
46 // that emscripten_get_now() is the best bet, as it is milliseconds
47 // (but returned as a double, with microseconds in the fractional portion),
48 // using either performance.now() or performance.hrtime() depending on the
49 // environment.
50 inline double benchmark_now() {
51     return emscripten_get_now();
52 }
53 
54 inline double benchmark_duration_seconds(double start, double end) {
55     return (end - start) / 1000.0;
56 }
57 
58 #endif
59 
60 // Benchmark the operation 'op'. The number of iterations refers to
61 // how many times the operation is run for each time measurement, the
62 // result is the minimum over a number of samples runs. The result is the
63 // amount of time in seconds for one iteration.
64 //
65 // NOTE: it is usually simpler and more accurate to use the adaptive
66 // version of benchmark() later in this file; this function is provided
67 // for legacy code.
68 //
69 // IMPORTANT NOTE: Using this tool for timing GPU code may be misleading,
70 // as it does not account for time needed to synchronize to/from the GPU;
71 // if the callback doesn't include calls to device_sync(), the reported
72 // time may only be that to queue the requests; if the callback *does*
73 // include calls to device_sync(), it might exaggerate the sync overhead
74 // for real-world use. For now, callers using this to benchmark GPU
75 // code should measure with extreme caution.
76 
77 inline double benchmark(uint64_t samples, uint64_t iterations, const std::function<void()> &op) {
78     double best = std::numeric_limits<double>::infinity();
79     for (uint64_t i = 0; i < samples; i++) {
80         auto start = benchmark_now();
81         for (uint64_t j = 0; j < iterations; j++) {
82             op();
83         }
84         auto end = benchmark_now();
85         double elapsed_seconds = benchmark_duration_seconds(start, end);
86         best = std::min(best, elapsed_seconds);
87     }
88     return best / iterations;
89 }
90 
91 // Benchmark the operation 'op': run the operation until at least min_time
92 // has elapsed; the number of iterations is expanded as we
93 // progress (based on initial runs of 'op') to minimize overhead. The time
94 // reported will be that of the best single iteration.
95 //
96 // Most callers should be able to get good results without needing to specify
97 // custom BenchmarkConfig values.
98 //
99 // IMPORTANT NOTE: Using this tool for timing GPU code may be misleading,
100 // as it does not account for time needed to synchronize to/from the GPU;
101 // if the callback doesn't include calls to device_sync(), the reported
102 // time may only be that to queue the requests; if the callback *does*
103 // include calls to device_sync(), it might exaggerate the sync overhead
104 // for real-world use. For now, callers using this to benchmark GPU
105 // code should measure with extreme caution.
106 
107 constexpr uint64_t kBenchmarkMaxIterations = 1000000000;
108 
109 struct BenchmarkConfig {
110     // Attempt to use this much time (in seconds) for the meaningful samples
111     // taken; initial iterations will be done to find an iterations-per-sample
112     // count that puts the total runtime in this ballpark.
113     double min_time{0.1};
114 
115     // Set an absolute upper time limit. Defaults to min_time * 4.
116     double max_time{0.1 * 4};
117 
118     // Terminate when the relative difference between the best runtime
119     // seen and the third-best runtime seen is no more than
120     // this. Controls accuracy. The closer to zero this gets the more
121     // reliable the answer, but the longer it may take to run.
122     double accuracy{0.03};
123 };
124 
125 struct BenchmarkResult {
126     // Best elapsed wall-clock time per iteration (seconds).
127     double wall_time;
128 
129     // Number of samples used for measurement.
130     // (There might be additional samples taken that are not used
131     // for measurement.)
132     uint64_t samples;
133 
134     // Total number of iterations across all samples.
135     // (There might be additional iterations taken that are not used
136     // for measurement.)
137     uint64_t iterations;
138 
139     // Measured accuracy between the best and third-best result.
140     // Will be <= config.accuracy unless max_time is exceeded.
141     double accuracy;
142 
143     operator double() const {
144         return wall_time;
145     }
146 };
147 
148 inline BenchmarkResult benchmark(const std::function<void()> &op, const BenchmarkConfig &config = {}) {
149     BenchmarkResult result{0, 0, 0};
150 
151     const double min_time = std::max(10 * 1e-6, config.min_time);
152     const double max_time = std::max(config.min_time, config.max_time);
153 
154     const double accuracy = 1.0 + std::min(std::max(0.001, config.accuracy), 0.1);
155 
156     // We will do (at least) kMinSamples samples; we will do additional
157     // samples until the best the kMinSamples'th results are within the
158     // accuracy tolerance (or we run out of iterations).
159     constexpr int kMinSamples = 3;
160     double times[kMinSamples + 1] = {0};
161 
162     double total_time = 0;
163     uint64_t iters_per_sample = 1;
164     for (;;) {
165         result.samples = 0;
166         result.iterations = 0;
167         total_time = 0;
168         for (int i = 0; i < kMinSamples; i++) {
169             times[i] = benchmark(1, iters_per_sample, op);
170             result.samples++;
171             result.iterations += iters_per_sample;
172             total_time += times[i] * iters_per_sample;
173         }
174         std::sort(times, times + kMinSamples);
175         if (times[0] * iters_per_sample * kMinSamples >= min_time) {
176             break;
177         }
178         // Use an estimate based on initial times to converge faster.
179         double next_iters = std::max(min_time / std::max(times[0] * kMinSamples, 1e-9),
180                                      iters_per_sample * 2.0);
181         iters_per_sample = (uint64_t)(next_iters + 0.5);
182     }
183 
184     // - Keep taking samples until we are accurate enough (even if we run over min_time).
185     // - If we are already accurate enough but have time remaining, keep taking samples.
186     // - No matter what, don't go over max_time; this is important, in case
187     // we happen to get faster results for the first samples, then happen to transition
188     // to throttled-down CPU state.
189     while ((times[0] * accuracy < times[kMinSamples - 1] || total_time < min_time) &&
190            total_time < max_time) {
191         times[kMinSamples] = benchmark(1, iters_per_sample, op);
192         result.samples++;
193         result.iterations += iters_per_sample;
194         total_time += times[kMinSamples] * iters_per_sample;
195         std::sort(times, times + kMinSamples + 1);
196     }
197     result.wall_time = times[0];
198     result.accuracy = (times[kMinSamples - 1] / times[0]) - 1.0;
199 
200     return result;
201 }
202 
203 }  // namespace Tools
204 }  // namespace Halide
205 
206 #endif
207