1 #ifndef BENCHMARK_H 2 #define BENCHMARK_H 3 4 #include <algorithm> 5 #include <cassert> 6 #include <chrono> 7 #include <functional> 8 #include <limits> 9 10 #if defined(__EMSCRIPTEN__) 11 #include <emscripten.h> 12 #endif 13 14 namespace Halide { 15 namespace Tools { 16 17 #if !defined(__EMSCRIPTEN__) 18 19 // Prefer high_resolution_clock, but only if it's steady... 20 template<bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady> 21 struct SteadyClock { 22 using type = std::chrono::high_resolution_clock; 23 }; 24 25 // ...otherwise use steady_clock. 26 template<> 27 struct SteadyClock<false> { 28 using type = std::chrono::steady_clock; 29 }; 30 31 inline SteadyClock<>::type::time_point benchmark_now() { 32 return SteadyClock<>::type::now(); 33 } 34 35 inline double benchmark_duration_seconds( 36 SteadyClock<>::type::time_point start, 37 SteadyClock<>::type::time_point end) { 38 return std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count(); 39 } 40 41 #else // __EMSCRIPTEN__ 42 43 // Emscripten's std::chrono::steady_clock and/or high_resolution_clock 44 // can throw an exception (!) if the runtime doesn't have a truly 45 // steady clock available. Advice from emscripten-discuss suggested 46 // that emscripten_get_now() is the best bet, as it is milliseconds 47 // (but returned as a double, with microseconds in the fractional portion), 48 // using either performance.now() or performance.hrtime() depending on the 49 // environment. 50 inline double benchmark_now() { 51 return emscripten_get_now(); 52 } 53 54 inline double benchmark_duration_seconds(double start, double end) { 55 return (end - start) / 1000.0; 56 } 57 58 #endif 59 60 // Benchmark the operation 'op'. The number of iterations refers to 61 // how many times the operation is run for each time measurement, the 62 // result is the minimum over a number of samples runs. The result is the 63 // amount of time in seconds for one iteration. 64 // 65 // NOTE: it is usually simpler and more accurate to use the adaptive 66 // version of benchmark() later in this file; this function is provided 67 // for legacy code. 68 // 69 // IMPORTANT NOTE: Using this tool for timing GPU code may be misleading, 70 // as it does not account for time needed to synchronize to/from the GPU; 71 // if the callback doesn't include calls to device_sync(), the reported 72 // time may only be that to queue the requests; if the callback *does* 73 // include calls to device_sync(), it might exaggerate the sync overhead 74 // for real-world use. For now, callers using this to benchmark GPU 75 // code should measure with extreme caution. 76 77 inline double benchmark(uint64_t samples, uint64_t iterations, const std::function<void()> &op) { 78 double best = std::numeric_limits<double>::infinity(); 79 for (uint64_t i = 0; i < samples; i++) { 80 auto start = benchmark_now(); 81 for (uint64_t j = 0; j < iterations; j++) { 82 op(); 83 } 84 auto end = benchmark_now(); 85 double elapsed_seconds = benchmark_duration_seconds(start, end); 86 best = std::min(best, elapsed_seconds); 87 } 88 return best / iterations; 89 } 90 91 // Benchmark the operation 'op': run the operation until at least min_time 92 // has elapsed; the number of iterations is expanded as we 93 // progress (based on initial runs of 'op') to minimize overhead. The time 94 // reported will be that of the best single iteration. 95 // 96 // Most callers should be able to get good results without needing to specify 97 // custom BenchmarkConfig values. 98 // 99 // IMPORTANT NOTE: Using this tool for timing GPU code may be misleading, 100 // as it does not account for time needed to synchronize to/from the GPU; 101 // if the callback doesn't include calls to device_sync(), the reported 102 // time may only be that to queue the requests; if the callback *does* 103 // include calls to device_sync(), it might exaggerate the sync overhead 104 // for real-world use. For now, callers using this to benchmark GPU 105 // code should measure with extreme caution. 106 107 constexpr uint64_t kBenchmarkMaxIterations = 1000000000; 108 109 struct BenchmarkConfig { 110 // Attempt to use this much time (in seconds) for the meaningful samples 111 // taken; initial iterations will be done to find an iterations-per-sample 112 // count that puts the total runtime in this ballpark. 113 double min_time{0.1}; 114 115 // Set an absolute upper time limit. Defaults to min_time * 4. 116 double max_time{0.1 * 4}; 117 118 // Terminate when the relative difference between the best runtime 119 // seen and the third-best runtime seen is no more than 120 // this. Controls accuracy. The closer to zero this gets the more 121 // reliable the answer, but the longer it may take to run. 122 double accuracy{0.03}; 123 }; 124 125 struct BenchmarkResult { 126 // Best elapsed wall-clock time per iteration (seconds). 127 double wall_time; 128 129 // Number of samples used for measurement. 130 // (There might be additional samples taken that are not used 131 // for measurement.) 132 uint64_t samples; 133 134 // Total number of iterations across all samples. 135 // (There might be additional iterations taken that are not used 136 // for measurement.) 137 uint64_t iterations; 138 139 // Measured accuracy between the best and third-best result. 140 // Will be <= config.accuracy unless max_time is exceeded. 141 double accuracy; 142 143 operator double() const { 144 return wall_time; 145 } 146 }; 147 148 inline BenchmarkResult benchmark(const std::function<void()> &op, const BenchmarkConfig &config = {}) { 149 BenchmarkResult result{0, 0, 0}; 150 151 const double min_time = std::max(10 * 1e-6, config.min_time); 152 const double max_time = std::max(config.min_time, config.max_time); 153 154 const double accuracy = 1.0 + std::min(std::max(0.001, config.accuracy), 0.1); 155 156 // We will do (at least) kMinSamples samples; we will do additional 157 // samples until the best the kMinSamples'th results are within the 158 // accuracy tolerance (or we run out of iterations). 159 constexpr int kMinSamples = 3; 160 double times[kMinSamples + 1] = {0}; 161 162 double total_time = 0; 163 uint64_t iters_per_sample = 1; 164 for (;;) { 165 result.samples = 0; 166 result.iterations = 0; 167 total_time = 0; 168 for (int i = 0; i < kMinSamples; i++) { 169 times[i] = benchmark(1, iters_per_sample, op); 170 result.samples++; 171 result.iterations += iters_per_sample; 172 total_time += times[i] * iters_per_sample; 173 } 174 std::sort(times, times + kMinSamples); 175 if (times[0] * iters_per_sample * kMinSamples >= min_time) { 176 break; 177 } 178 // Use an estimate based on initial times to converge faster. 179 double next_iters = std::max(min_time / std::max(times[0] * kMinSamples, 1e-9), 180 iters_per_sample * 2.0); 181 iters_per_sample = (uint64_t)(next_iters + 0.5); 182 } 183 184 // - Keep taking samples until we are accurate enough (even if we run over min_time). 185 // - If we are already accurate enough but have time remaining, keep taking samples. 186 // - No matter what, don't go over max_time; this is important, in case 187 // we happen to get faster results for the first samples, then happen to transition 188 // to throttled-down CPU state. 189 while ((times[0] * accuracy < times[kMinSamples - 1] || total_time < min_time) && 190 total_time < max_time) { 191 times[kMinSamples] = benchmark(1, iters_per_sample, op); 192 result.samples++; 193 result.iterations += iters_per_sample; 194 total_time += times[kMinSamples] * iters_per_sample; 195 std::sort(times, times + kMinSamples + 1); 196 } 197 result.wall_time = times[0]; 198 result.accuracy = (times[kMinSamples - 1] / times[0]) - 1.0; 199 200 return result; 201 } 202 203 } // namespace Tools 204 } // namespace Halide 205 206 #endif 207