1 /*
2  * Copyright 2012-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <folly/Portability.h>
20 #include <folly/Preprocessor.h> // for FB_ANONYMOUS_VARIABLE
21 #include <folly/ScopeGuard.h>
22 #include <folly/Traits.h>
23 #include <folly/functional/Invoke.h>
24 #include <folly/portability/GFlags.h>
25 
26 #include <cassert>
27 #include <chrono>
28 #include <functional>
29 #include <limits>
30 #include <type_traits>
31 
32 #include <boost/function_types/function_arity.hpp>
33 #include <glog/logging.h>
34 
35 DECLARE_bool(benchmark);
36 
37 namespace folly {
38 
39 /**
40  * Runs all benchmarks defined. Usually put in main().
41  */
42 void runBenchmarks();
43 
44 /**
45  * Runs all benchmarks defined if and only if the --benchmark flag has
46  * been passed to the program. Usually put in main().
47  */
runBenchmarksOnFlag()48 inline bool runBenchmarksOnFlag() {
49   if (FLAGS_benchmark) {
50     runBenchmarks();
51   }
52   return FLAGS_benchmark;
53 }
54 
55 namespace detail {
56 
57 using TimeIterPair =
58     std::pair<std::chrono::high_resolution_clock::duration, unsigned int>;
59 using BenchmarkFun = std::function<detail::TimeIterPair(unsigned int)>;
60 
61 struct BenchmarkRegistration {
62   std::string file;
63   std::string name;
64   BenchmarkFun func;
65 };
66 
67 struct BenchmarkResult {
68   std::string file;
69   std::string name;
70   double timeInNs;
71 };
72 
73 /**
74  * Adds a benchmark wrapped in a std::function. Only used
75  * internally. Pass by value is intentional.
76  */
77 void addBenchmarkImpl(
78     const char* file,
79     const char* name,
80     std::function<TimeIterPair(unsigned int)>);
81 
82 } // namespace detail
83 
84 /**
85  * Supporting type for BENCHMARK_SUSPEND defined below.
86  */
87 struct BenchmarkSuspender {
88   using Clock = std::chrono::high_resolution_clock;
89   using TimePoint = Clock::time_point;
90   using Duration = Clock::duration;
91 
BenchmarkSuspenderBenchmarkSuspender92   BenchmarkSuspender() {
93     start = Clock::now();
94   }
95 
96   BenchmarkSuspender(const BenchmarkSuspender&) = delete;
BenchmarkSuspenderBenchmarkSuspender97   BenchmarkSuspender(BenchmarkSuspender&& rhs) noexcept {
98     start = rhs.start;
99     rhs.start = {};
100   }
101 
102   BenchmarkSuspender& operator=(const BenchmarkSuspender&) = delete;
103   BenchmarkSuspender& operator=(BenchmarkSuspender&& rhs) {
104     if (start != TimePoint{}) {
105       tally();
106     }
107     start = rhs.start;
108     rhs.start = {};
109     return *this;
110   }
111 
~BenchmarkSuspenderBenchmarkSuspender112   ~BenchmarkSuspender() {
113     if (start != TimePoint{}) {
114       tally();
115     }
116   }
117 
dismissBenchmarkSuspender118   void dismiss() {
119     assert(start != TimePoint{});
120     tally();
121     start = {};
122   }
123 
rehireBenchmarkSuspender124   void rehire() {
125     assert(start == TimePoint{});
126     start = Clock::now();
127   }
128 
129   template <class F>
130   auto dismissing(F f) -> invoke_result_t<F> {
131     SCOPE_EXIT {
132       rehire();
133     };
134     dismiss();
135     return f();
136   }
137 
138   /**
139    * This is for use inside of if-conditions, used in BENCHMARK macros.
140    * If-conditions bypass the explicit on operator bool.
141    */
142   explicit operator bool() const {
143     return false;
144   }
145 
146   /**
147    * Accumulates time spent outside benchmark.
148    */
149   static Duration timeSpent;
150 
151  private:
tallyBenchmarkSuspender152   void tally() {
153     auto end = Clock::now();
154     timeSpent += end - start;
155     start = end;
156   }
157 
158   TimePoint start;
159 };
160 
161 /**
162  * Adds a benchmark. Usually not called directly but instead through
163  * the macro BENCHMARK defined below. The lambda function involved
164  * must take exactly one parameter of type unsigned, and the benchmark
165  * uses it with counter semantics (iteration occurs inside the
166  * function).
167  */
168 template <typename Lambda>
169 typename std::enable_if<
170     boost::function_types::function_arity<
171         decltype(&Lambda::operator())>::value == 2>::type
addBenchmark(const char * file,const char * name,Lambda && lambda)172 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
173   auto execute = [=](unsigned int times) {
174     BenchmarkSuspender::timeSpent = {};
175     unsigned int niter;
176 
177     // CORE MEASUREMENT STARTS
178     auto start = std::chrono::high_resolution_clock::now();
179     niter = lambda(times);
180     auto end = std::chrono::high_resolution_clock::now();
181     // CORE MEASUREMENT ENDS
182 
183     return detail::TimeIterPair(
184         (end - start) - BenchmarkSuspender::timeSpent, niter);
185   };
186 
187   detail::addBenchmarkImpl(
188       file, name, std::function<detail::TimeIterPair(unsigned int)>(execute));
189 }
190 
191 /**
192  * Adds a benchmark. Usually not called directly but instead through
193  * the macro BENCHMARK defined below. The lambda function involved
194  * must take zero parameters, and the benchmark calls it repeatedly
195  * (iteration occurs outside the function).
196  */
197 template <typename Lambda>
198 typename std::enable_if<
199     boost::function_types::function_arity<
200         decltype(&Lambda::operator())>::value == 1>::type
addBenchmark(const char * file,const char * name,Lambda && lambda)201 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
202   addBenchmark(file, name, [=](unsigned int times) {
203     unsigned int niter = 0;
204     while (times-- > 0) {
205       niter += lambda();
206     }
207     return niter;
208   });
209 }
210 
211 /**
212  * Call doNotOptimizeAway(var) to ensure that var will be computed even
213  * post-optimization.  Use it for variables that are computed during
214  * benchmarking but otherwise are useless. The compiler tends to do a
215  * good job at eliminating unused variables, and this function fools it
216  * into thinking var is in fact needed.
217  *
218  * Call makeUnpredictable(var) when you don't want the optimizer to use
219  * its knowledge of var to shape the following code.  This is useful
220  * when constant propagation or power reduction is possible during your
221  * benchmark but not in real use cases.
222  */
223 
224 #ifdef _MSC_VER
225 
226 #pragma optimize("", off)
227 
doNotOptimizeDependencySink(const void *)228 inline void doNotOptimizeDependencySink(const void*) {}
229 
230 #pragma optimize("", on)
231 
232 template <class T>
doNotOptimizeAway(const T & datum)233 void doNotOptimizeAway(const T& datum) {
234   doNotOptimizeDependencySink(&datum);
235 }
236 
237 template <typename T>
makeUnpredictable(T & datum)238 void makeUnpredictable(T& datum) {
239   doNotOptimizeDependencySink(&datum);
240 }
241 
242 #else
243 
244 namespace detail {
245 template <typename T>
246 struct DoNotOptimizeAwayNeedsIndirect {
247   using Decayed = typename std::decay<T>::type;
248 
249   // First two constraints ensure it can be an "r" operand.
250   // std::is_pointer check is because callers seem to expect that
251   // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x).
252   constexpr static bool value = !folly::is_trivially_copyable<Decayed>::value ||
253       sizeof(Decayed) > sizeof(long) || std::is_pointer<Decayed>::value;
254 };
255 } // namespace detail
256 
257 template <typename T>
258 auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
259     !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
260   // The "r" constraint forces the compiler to make datum available
261   // in a register to the asm block, which means that it must have
262   // computed/loaded it.  We use this path for things that are <=
263   // sizeof(long) (they have to fit), trivial (otherwise the compiler
264   // doesn't want to put them in a register), and not a pointer (because
265   // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't
266   // necessarily compute foo).
267   //
268   // An earlier version of this method had a more permissive input operand
269   // constraint, but that caused unnecessary variation between clang and
270   // gcc benchmarks.
271   asm volatile("" ::"r"(datum));
272 }
273 
274 template <typename T>
275 auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
276     detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
277   // This version of doNotOptimizeAway tells the compiler that the asm
278   // block will read datum from memory, and that in addition it might read
279   // or write from any memory location.  If the memory clobber could be
280   // separated into input and output that would be preferrable.
281   asm volatile("" ::"m"(datum) : "memory");
282 }
283 
284 template <typename T>
285 auto makeUnpredictable(T& datum) -> typename std::enable_if<
286     !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
287   asm volatile("" : "+r"(datum));
288 }
289 
290 template <typename T>
291 auto makeUnpredictable(T& datum) -> typename std::enable_if<
292     detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
293   asm volatile("" ::"m"(datum) : "memory");
294 }
295 
296 #endif
297 
298 struct dynamic;
299 
300 void benchmarkResultsToDynamic(
301     const std::vector<detail::BenchmarkResult>& data,
302     dynamic&);
303 
304 void benchmarkResultsFromDynamic(
305     const dynamic&,
306     std::vector<detail::BenchmarkResult>&);
307 
308 void printResultComparison(
309     const std::vector<detail::BenchmarkResult>& base,
310     const std::vector<detail::BenchmarkResult>& test);
311 
312 } // namespace folly
313 
314 /**
315  * Introduces a benchmark function. Used internally, see BENCHMARK and
316  * friends below.
317  */
318 #define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName) \
319   static void funName(paramType);                                     \
320   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) =           \
321       (::folly::addBenchmark(                                         \
322            __FILE__,                                                  \
323            stringName,                                                \
324            [](paramType paramName) -> unsigned {                      \
325              funName(paramName);                                      \
326              return rv;                                               \
327            }),                                                        \
328        true);                                                         \
329   static void funName(paramType paramName)
330 
331 /**
332  * Introduces a benchmark function with support for returning the actual
333  * number of iterations. Used internally, see BENCHMARK_MULTI and friends
334  * below.
335  */
336 #define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \
337   static unsigned funName(paramType);                                   \
338   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) =             \
339       (::folly::addBenchmark(                                           \
340            __FILE__,                                                    \
341            stringName,                                                  \
342            [](paramType paramName) { return funName(paramName); }),     \
343        true);                                                           \
344   static unsigned funName(paramType paramName)
345 
346 /**
347  * Introduces a benchmark function. Use with either one or two arguments.
348  * The first is the name of the benchmark. Use something descriptive, such
349  * as insertVectorBegin. The second argument may be missing, or could be a
350  * symbolic counter. The counter dictates how many internal iteration the
351  * benchmark does. Example:
352  *
353  * BENCHMARK(vectorPushBack) {
354  *   vector<int> v;
355  *   v.push_back(42);
356  * }
357  *
358  * BENCHMARK(insertVectorBegin, n) {
359  *   vector<int> v;
360  *   FOR_EACH_RANGE (i, 0, n) {
361  *     v.insert(v.begin(), 42);
362  *   }
363  * }
364  */
365 #define BENCHMARK(name, ...)                   \
366   BENCHMARK_IMPL(                              \
367       name,                                    \
368       FB_STRINGIZE(name),                      \
369       FB_ARG_2_OR_1(1, ##__VA_ARGS__),         \
370       FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
371       __VA_ARGS__)
372 
373 /**
374  * Like BENCHMARK above, but allows the user to return the actual
375  * number of iterations executed in the function body. This can be
376  * useful if the benchmark function doesn't know upfront how many
377  * iterations it's going to run or if it runs through a certain
378  * number of test cases, e.g.:
379  *
380  * BENCHMARK_MULTI(benchmarkSomething) {
381  *   std::vector<int> testCases { 0, 1, 1, 2, 3, 5 };
382  *   for (int c : testCases) {
383  *     doSomething(c);
384  *   }
385  *   return testCases.size();
386  * }
387  */
388 #define BENCHMARK_MULTI(name, ...)             \
389   BENCHMARK_MULTI_IMPL(                        \
390       name,                                    \
391       FB_STRINGIZE(name),                      \
392       FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
393       __VA_ARGS__)
394 
395 /**
396  * Defines a benchmark that passes a parameter to another one. This is
397  * common for benchmarks that need a "problem size" in addition to
398  * "number of iterations". Consider:
399  *
400  * void pushBack(uint32_t n, size_t initialSize) {
401  *   vector<int> v;
402  *   BENCHMARK_SUSPEND {
403  *     v.resize(initialSize);
404  *   }
405  *   FOR_EACH_RANGE (i, 0, n) {
406  *    v.push_back(i);
407  *   }
408  * }
409  * BENCHMARK_PARAM(pushBack, 0)
410  * BENCHMARK_PARAM(pushBack, 1000)
411  * BENCHMARK_PARAM(pushBack, 1000000)
412  *
413  * The benchmark above estimates the speed of push_back at different
414  * initial sizes of the vector. The framework will pass 0, 1000, and
415  * 1000000 for initialSize, and the iteration count for n.
416  */
417 #define BENCHMARK_PARAM(name, param) BENCHMARK_NAMED_PARAM(name, param, param)
418 
419 /**
420  * Same as BENCHMARK_PARAM, but allows one to return the actual number of
421  * iterations that have been run.
422  */
423 #define BENCHMARK_PARAM_MULTI(name, param) \
424   BENCHMARK_NAMED_PARAM_MULTI(name, param, param)
425 
426 /*
427  * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
428  * parameter, rather than using the parameter value.
429  *
430  * Useful when the parameter value is not a valid token for string pasting,
431  * of when you want to specify multiple parameter arguments.
432  *
433  * For example:
434  *
435  * void addValue(uint32_t n, int64_t bucketSize, int64_t min, int64_t max) {
436  *   Histogram<int64_t> hist(bucketSize, min, max);
437  *   int64_t num = min;
438  *   FOR_EACH_RANGE (i, 0, n) {
439  *     hist.addValue(num);
440  *     ++num;
441  *     if (num > max) { num = min; }
442  *   }
443  * }
444  *
445  * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
446  * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
447  * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
448  */
449 #define BENCHMARK_NAMED_PARAM(name, param_name, ...)       \
450   BENCHMARK_IMPL(                                          \
451       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
452       FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
453       iters,                                               \
454       unsigned,                                            \
455       iters) {                                             \
456     name(iters, ##__VA_ARGS__);                            \
457   }
458 
459 /**
460  * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number
461  * of iterations that have been run.
462  */
463 #define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...) \
464   BENCHMARK_MULTI_IMPL(                                    \
465       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
466       FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
467       unsigned,                                            \
468       iters) {                                             \
469     return name(iters, ##__VA_ARGS__);                     \
470   }
471 
472 /**
473  * Just like BENCHMARK, but prints the time relative to a
474  * baseline. The baseline is the most recent BENCHMARK() seen in
475  * the current scope. Example:
476  *
477  * // This is the baseline
478  * BENCHMARK(insertVectorBegin, n) {
479  *   vector<int> v;
480  *   FOR_EACH_RANGE (i, 0, n) {
481  *     v.insert(v.begin(), 42);
482  *   }
483  * }
484  *
485  * BENCHMARK_RELATIVE(insertListBegin, n) {
486  *   list<int> s;
487  *   FOR_EACH_RANGE (i, 0, n) {
488  *     s.insert(s.begin(), 42);
489  *   }
490  * }
491  *
492  * Any number of relative benchmark can be associated with a
493  * baseline. Another BENCHMARK() occurrence effectively establishes a
494  * new baseline.
495  */
496 #define BENCHMARK_RELATIVE(name, ...)          \
497   BENCHMARK_IMPL(                              \
498       name,                                    \
499       "%" FB_STRINGIZE(name),                  \
500       FB_ARG_2_OR_1(1, ##__VA_ARGS__),         \
501       FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
502       __VA_ARGS__)
503 
504 /**
505  * Same as BENCHMARK_RELATIVE, but allows one to return the actual number
506  * of iterations that have been run.
507  */
508 #define BENCHMARK_RELATIVE_MULTI(name, ...)    \
509   BENCHMARK_MULTI_IMPL(                        \
510       name,                                    \
511       "%" FB_STRINGIZE(name),                  \
512       FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
513       __VA_ARGS__)
514 
515 /**
516  * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
517  */
518 #define BENCHMARK_RELATIVE_PARAM(name, param) \
519   BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
520 
521 /**
522  * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual
523  * number of iterations that have been run.
524  */
525 #define BENCHMARK_RELATIVE_PARAM_MULTI(name, param) \
526   BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param)
527 
528 /**
529  * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
530  */
531 #define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...)  \
532   BENCHMARK_IMPL(                                              \
533       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),     \
534       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
535       iters,                                                   \
536       unsigned,                                                \
537       iters) {                                                 \
538     name(iters, ##__VA_ARGS__);                                \
539   }
540 
541 /**
542  * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the
543  * actual number of iterations that have been run.
544  */
545 #define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...) \
546   BENCHMARK_MULTI_IMPL(                                             \
547       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),          \
548       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",      \
549       unsigned,                                                     \
550       iters) {                                                      \
551     return name(iters, ##__VA_ARGS__);                              \
552   }
553 
554 /**
555  * Draws a line of dashes.
556  */
557 #define BENCHMARK_DRAW_LINE()                                                \
558   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) =                  \
559       (::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \
560        true)
561 
562 /**
563  * Allows execution of code that doesn't count torward the benchmark's
564  * time budget. Example:
565  *
566  * BENCHMARK_START_GROUP(insertVectorBegin, n) {
567  *   vector<int> v;
568  *   BENCHMARK_SUSPEND {
569  *     v.reserve(n);
570  *   }
571  *   FOR_EACH_RANGE (i, 0, n) {
572  *     v.insert(v.begin(), 42);
573  *   }
574  * }
575  */
576 #define BENCHMARK_SUSPEND                             \
577   if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \
578           ::folly::BenchmarkSuspender()) {            \
579   } else
580