1 /*
2 * Copyright 2012-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #pragma once
18
19 #include <folly/Portability.h>
20 #include <folly/Preprocessor.h> // for FB_ANONYMOUS_VARIABLE
21 #include <folly/ScopeGuard.h>
22 #include <folly/Traits.h>
23 #include <folly/functional/Invoke.h>
24 #include <folly/portability/GFlags.h>
25
26 #include <cassert>
27 #include <chrono>
28 #include <functional>
29 #include <limits>
30 #include <type_traits>
31
32 #include <boost/function_types/function_arity.hpp>
33 #include <glog/logging.h>
34
35 DECLARE_bool(benchmark);
36
37 namespace folly {
38
39 /**
40 * Runs all benchmarks defined. Usually put in main().
41 */
42 void runBenchmarks();
43
44 /**
45 * Runs all benchmarks defined if and only if the --benchmark flag has
46 * been passed to the program. Usually put in main().
47 */
runBenchmarksOnFlag()48 inline bool runBenchmarksOnFlag() {
49 if (FLAGS_benchmark) {
50 runBenchmarks();
51 }
52 return FLAGS_benchmark;
53 }
54
55 namespace detail {
56
57 using TimeIterPair =
58 std::pair<std::chrono::high_resolution_clock::duration, unsigned int>;
59 using BenchmarkFun = std::function<detail::TimeIterPair(unsigned int)>;
60
61 struct BenchmarkRegistration {
62 std::string file;
63 std::string name;
64 BenchmarkFun func;
65 };
66
67 struct BenchmarkResult {
68 std::string file;
69 std::string name;
70 double timeInNs;
71 };
72
73 /**
74 * Adds a benchmark wrapped in a std::function. Only used
75 * internally. Pass by value is intentional.
76 */
77 void addBenchmarkImpl(
78 const char* file,
79 const char* name,
80 std::function<TimeIterPair(unsigned int)>);
81
82 } // namespace detail
83
84 /**
85 * Supporting type for BENCHMARK_SUSPEND defined below.
86 */
87 struct BenchmarkSuspender {
88 using Clock = std::chrono::high_resolution_clock;
89 using TimePoint = Clock::time_point;
90 using Duration = Clock::duration;
91
BenchmarkSuspenderBenchmarkSuspender92 BenchmarkSuspender() {
93 start = Clock::now();
94 }
95
96 BenchmarkSuspender(const BenchmarkSuspender&) = delete;
BenchmarkSuspenderBenchmarkSuspender97 BenchmarkSuspender(BenchmarkSuspender&& rhs) noexcept {
98 start = rhs.start;
99 rhs.start = {};
100 }
101
102 BenchmarkSuspender& operator=(const BenchmarkSuspender&) = delete;
103 BenchmarkSuspender& operator=(BenchmarkSuspender&& rhs) {
104 if (start != TimePoint{}) {
105 tally();
106 }
107 start = rhs.start;
108 rhs.start = {};
109 return *this;
110 }
111
~BenchmarkSuspenderBenchmarkSuspender112 ~BenchmarkSuspender() {
113 if (start != TimePoint{}) {
114 tally();
115 }
116 }
117
dismissBenchmarkSuspender118 void dismiss() {
119 assert(start != TimePoint{});
120 tally();
121 start = {};
122 }
123
rehireBenchmarkSuspender124 void rehire() {
125 assert(start == TimePoint{});
126 start = Clock::now();
127 }
128
129 template <class F>
130 auto dismissing(F f) -> invoke_result_t<F> {
131 SCOPE_EXIT {
132 rehire();
133 };
134 dismiss();
135 return f();
136 }
137
138 /**
139 * This is for use inside of if-conditions, used in BENCHMARK macros.
140 * If-conditions bypass the explicit on operator bool.
141 */
142 explicit operator bool() const {
143 return false;
144 }
145
146 /**
147 * Accumulates time spent outside benchmark.
148 */
149 static Duration timeSpent;
150
151 private:
tallyBenchmarkSuspender152 void tally() {
153 auto end = Clock::now();
154 timeSpent += end - start;
155 start = end;
156 }
157
158 TimePoint start;
159 };
160
161 /**
162 * Adds a benchmark. Usually not called directly but instead through
163 * the macro BENCHMARK defined below. The lambda function involved
164 * must take exactly one parameter of type unsigned, and the benchmark
165 * uses it with counter semantics (iteration occurs inside the
166 * function).
167 */
168 template <typename Lambda>
169 typename std::enable_if<
170 boost::function_types::function_arity<
171 decltype(&Lambda::operator())>::value == 2>::type
addBenchmark(const char * file,const char * name,Lambda && lambda)172 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
173 auto execute = [=](unsigned int times) {
174 BenchmarkSuspender::timeSpent = {};
175 unsigned int niter;
176
177 // CORE MEASUREMENT STARTS
178 auto start = std::chrono::high_resolution_clock::now();
179 niter = lambda(times);
180 auto end = std::chrono::high_resolution_clock::now();
181 // CORE MEASUREMENT ENDS
182
183 return detail::TimeIterPair(
184 (end - start) - BenchmarkSuspender::timeSpent, niter);
185 };
186
187 detail::addBenchmarkImpl(
188 file, name, std::function<detail::TimeIterPair(unsigned int)>(execute));
189 }
190
191 /**
192 * Adds a benchmark. Usually not called directly but instead through
193 * the macro BENCHMARK defined below. The lambda function involved
194 * must take zero parameters, and the benchmark calls it repeatedly
195 * (iteration occurs outside the function).
196 */
197 template <typename Lambda>
198 typename std::enable_if<
199 boost::function_types::function_arity<
200 decltype(&Lambda::operator())>::value == 1>::type
addBenchmark(const char * file,const char * name,Lambda && lambda)201 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
202 addBenchmark(file, name, [=](unsigned int times) {
203 unsigned int niter = 0;
204 while (times-- > 0) {
205 niter += lambda();
206 }
207 return niter;
208 });
209 }
210
211 /**
212 * Call doNotOptimizeAway(var) to ensure that var will be computed even
213 * post-optimization. Use it for variables that are computed during
214 * benchmarking but otherwise are useless. The compiler tends to do a
215 * good job at eliminating unused variables, and this function fools it
216 * into thinking var is in fact needed.
217 *
218 * Call makeUnpredictable(var) when you don't want the optimizer to use
219 * its knowledge of var to shape the following code. This is useful
220 * when constant propagation or power reduction is possible during your
221 * benchmark but not in real use cases.
222 */
223
224 #ifdef _MSC_VER
225
226 #pragma optimize("", off)
227
doNotOptimizeDependencySink(const void *)228 inline void doNotOptimizeDependencySink(const void*) {}
229
230 #pragma optimize("", on)
231
232 template <class T>
doNotOptimizeAway(const T & datum)233 void doNotOptimizeAway(const T& datum) {
234 doNotOptimizeDependencySink(&datum);
235 }
236
237 template <typename T>
makeUnpredictable(T & datum)238 void makeUnpredictable(T& datum) {
239 doNotOptimizeDependencySink(&datum);
240 }
241
242 #else
243
244 namespace detail {
245 template <typename T>
246 struct DoNotOptimizeAwayNeedsIndirect {
247 using Decayed = typename std::decay<T>::type;
248
249 // First two constraints ensure it can be an "r" operand.
250 // std::is_pointer check is because callers seem to expect that
251 // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x).
252 constexpr static bool value = !folly::is_trivially_copyable<Decayed>::value ||
253 sizeof(Decayed) > sizeof(long) || std::is_pointer<Decayed>::value;
254 };
255 } // namespace detail
256
257 template <typename T>
258 auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
259 !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
260 // The "r" constraint forces the compiler to make datum available
261 // in a register to the asm block, which means that it must have
262 // computed/loaded it. We use this path for things that are <=
263 // sizeof(long) (they have to fit), trivial (otherwise the compiler
264 // doesn't want to put them in a register), and not a pointer (because
265 // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't
266 // necessarily compute foo).
267 //
268 // An earlier version of this method had a more permissive input operand
269 // constraint, but that caused unnecessary variation between clang and
270 // gcc benchmarks.
271 asm volatile("" ::"r"(datum));
272 }
273
274 template <typename T>
275 auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
276 detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
277 // This version of doNotOptimizeAway tells the compiler that the asm
278 // block will read datum from memory, and that in addition it might read
279 // or write from any memory location. If the memory clobber could be
280 // separated into input and output that would be preferrable.
281 asm volatile("" ::"m"(datum) : "memory");
282 }
283
284 template <typename T>
285 auto makeUnpredictable(T& datum) -> typename std::enable_if<
286 !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
287 asm volatile("" : "+r"(datum));
288 }
289
290 template <typename T>
291 auto makeUnpredictable(T& datum) -> typename std::enable_if<
292 detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
293 asm volatile("" ::"m"(datum) : "memory");
294 }
295
296 #endif
297
298 struct dynamic;
299
300 void benchmarkResultsToDynamic(
301 const std::vector<detail::BenchmarkResult>& data,
302 dynamic&);
303
304 void benchmarkResultsFromDynamic(
305 const dynamic&,
306 std::vector<detail::BenchmarkResult>&);
307
308 void printResultComparison(
309 const std::vector<detail::BenchmarkResult>& base,
310 const std::vector<detail::BenchmarkResult>& test);
311
312 } // namespace folly
313
314 /**
315 * Introduces a benchmark function. Used internally, see BENCHMARK and
316 * friends below.
317 */
318 #define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName) \
319 static void funName(paramType); \
320 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
321 (::folly::addBenchmark( \
322 __FILE__, \
323 stringName, \
324 [](paramType paramName) -> unsigned { \
325 funName(paramName); \
326 return rv; \
327 }), \
328 true); \
329 static void funName(paramType paramName)
330
331 /**
332 * Introduces a benchmark function with support for returning the actual
333 * number of iterations. Used internally, see BENCHMARK_MULTI and friends
334 * below.
335 */
336 #define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \
337 static unsigned funName(paramType); \
338 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
339 (::folly::addBenchmark( \
340 __FILE__, \
341 stringName, \
342 [](paramType paramName) { return funName(paramName); }), \
343 true); \
344 static unsigned funName(paramType paramName)
345
346 /**
347 * Introduces a benchmark function. Use with either one or two arguments.
348 * The first is the name of the benchmark. Use something descriptive, such
349 * as insertVectorBegin. The second argument may be missing, or could be a
350 * symbolic counter. The counter dictates how many internal iteration the
351 * benchmark does. Example:
352 *
353 * BENCHMARK(vectorPushBack) {
354 * vector<int> v;
355 * v.push_back(42);
356 * }
357 *
358 * BENCHMARK(insertVectorBegin, n) {
359 * vector<int> v;
360 * FOR_EACH_RANGE (i, 0, n) {
361 * v.insert(v.begin(), 42);
362 * }
363 * }
364 */
365 #define BENCHMARK(name, ...) \
366 BENCHMARK_IMPL( \
367 name, \
368 FB_STRINGIZE(name), \
369 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
370 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
371 __VA_ARGS__)
372
373 /**
374 * Like BENCHMARK above, but allows the user to return the actual
375 * number of iterations executed in the function body. This can be
376 * useful if the benchmark function doesn't know upfront how many
377 * iterations it's going to run or if it runs through a certain
378 * number of test cases, e.g.:
379 *
380 * BENCHMARK_MULTI(benchmarkSomething) {
381 * std::vector<int> testCases { 0, 1, 1, 2, 3, 5 };
382 * for (int c : testCases) {
383 * doSomething(c);
384 * }
385 * return testCases.size();
386 * }
387 */
388 #define BENCHMARK_MULTI(name, ...) \
389 BENCHMARK_MULTI_IMPL( \
390 name, \
391 FB_STRINGIZE(name), \
392 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
393 __VA_ARGS__)
394
395 /**
396 * Defines a benchmark that passes a parameter to another one. This is
397 * common for benchmarks that need a "problem size" in addition to
398 * "number of iterations". Consider:
399 *
400 * void pushBack(uint32_t n, size_t initialSize) {
401 * vector<int> v;
402 * BENCHMARK_SUSPEND {
403 * v.resize(initialSize);
404 * }
405 * FOR_EACH_RANGE (i, 0, n) {
406 * v.push_back(i);
407 * }
408 * }
409 * BENCHMARK_PARAM(pushBack, 0)
410 * BENCHMARK_PARAM(pushBack, 1000)
411 * BENCHMARK_PARAM(pushBack, 1000000)
412 *
413 * The benchmark above estimates the speed of push_back at different
414 * initial sizes of the vector. The framework will pass 0, 1000, and
415 * 1000000 for initialSize, and the iteration count for n.
416 */
417 #define BENCHMARK_PARAM(name, param) BENCHMARK_NAMED_PARAM(name, param, param)
418
419 /**
420 * Same as BENCHMARK_PARAM, but allows one to return the actual number of
421 * iterations that have been run.
422 */
423 #define BENCHMARK_PARAM_MULTI(name, param) \
424 BENCHMARK_NAMED_PARAM_MULTI(name, param, param)
425
426 /*
427 * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
428 * parameter, rather than using the parameter value.
429 *
430 * Useful when the parameter value is not a valid token for string pasting,
431 * of when you want to specify multiple parameter arguments.
432 *
433 * For example:
434 *
435 * void addValue(uint32_t n, int64_t bucketSize, int64_t min, int64_t max) {
436 * Histogram<int64_t> hist(bucketSize, min, max);
437 * int64_t num = min;
438 * FOR_EACH_RANGE (i, 0, n) {
439 * hist.addValue(num);
440 * ++num;
441 * if (num > max) { num = min; }
442 * }
443 * }
444 *
445 * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
446 * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
447 * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
448 */
449 #define BENCHMARK_NAMED_PARAM(name, param_name, ...) \
450 BENCHMARK_IMPL( \
451 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
452 FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
453 iters, \
454 unsigned, \
455 iters) { \
456 name(iters, ##__VA_ARGS__); \
457 }
458
459 /**
460 * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number
461 * of iterations that have been run.
462 */
463 #define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...) \
464 BENCHMARK_MULTI_IMPL( \
465 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
466 FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
467 unsigned, \
468 iters) { \
469 return name(iters, ##__VA_ARGS__); \
470 }
471
472 /**
473 * Just like BENCHMARK, but prints the time relative to a
474 * baseline. The baseline is the most recent BENCHMARK() seen in
475 * the current scope. Example:
476 *
477 * // This is the baseline
478 * BENCHMARK(insertVectorBegin, n) {
479 * vector<int> v;
480 * FOR_EACH_RANGE (i, 0, n) {
481 * v.insert(v.begin(), 42);
482 * }
483 * }
484 *
485 * BENCHMARK_RELATIVE(insertListBegin, n) {
486 * list<int> s;
487 * FOR_EACH_RANGE (i, 0, n) {
488 * s.insert(s.begin(), 42);
489 * }
490 * }
491 *
492 * Any number of relative benchmark can be associated with a
493 * baseline. Another BENCHMARK() occurrence effectively establishes a
494 * new baseline.
495 */
496 #define BENCHMARK_RELATIVE(name, ...) \
497 BENCHMARK_IMPL( \
498 name, \
499 "%" FB_STRINGIZE(name), \
500 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
501 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
502 __VA_ARGS__)
503
504 /**
505 * Same as BENCHMARK_RELATIVE, but allows one to return the actual number
506 * of iterations that have been run.
507 */
508 #define BENCHMARK_RELATIVE_MULTI(name, ...) \
509 BENCHMARK_MULTI_IMPL( \
510 name, \
511 "%" FB_STRINGIZE(name), \
512 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
513 __VA_ARGS__)
514
515 /**
516 * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
517 */
518 #define BENCHMARK_RELATIVE_PARAM(name, param) \
519 BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
520
521 /**
522 * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual
523 * number of iterations that have been run.
524 */
525 #define BENCHMARK_RELATIVE_PARAM_MULTI(name, param) \
526 BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param)
527
528 /**
529 * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
530 */
531 #define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...) \
532 BENCHMARK_IMPL( \
533 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
534 "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
535 iters, \
536 unsigned, \
537 iters) { \
538 name(iters, ##__VA_ARGS__); \
539 }
540
541 /**
542 * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the
543 * actual number of iterations that have been run.
544 */
545 #define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...) \
546 BENCHMARK_MULTI_IMPL( \
547 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
548 "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
549 unsigned, \
550 iters) { \
551 return name(iters, ##__VA_ARGS__); \
552 }
553
554 /**
555 * Draws a line of dashes.
556 */
557 #define BENCHMARK_DRAW_LINE() \
558 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
559 (::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \
560 true)
561
562 /**
563 * Allows execution of code that doesn't count torward the benchmark's
564 * time budget. Example:
565 *
566 * BENCHMARK_START_GROUP(insertVectorBegin, n) {
567 * vector<int> v;
568 * BENCHMARK_SUSPEND {
569 * v.reserve(n);
570 * }
571 * FOR_EACH_RANGE (i, 0, n) {
572 * v.insert(v.begin(), 42);
573 * }
574 * }
575 */
576 #define BENCHMARK_SUSPEND \
577 if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \
578 ::folly::BenchmarkSuspender()) { \
579 } else
580