1 //  __   _ _______ __   _  _____  ______  _______ __   _ _______ _     _
2 //  | \  | |_____| | \  | |     | |_____] |______ | \  | |       |_____|
3 //  |  \_| |     | |  \_| |_____| |_____] |______ |  \_| |_____  |     |
4 //
5 // Microbenchmark framework for C++11/14/17/20
6 // https://github.com/martinus/nanobench
7 //
8 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9 // SPDX-License-Identifier: MIT
10 // Copyright (c) 2019-2020 Martin Ankerl <martin.ankerl@gmail.com>
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining a copy
13 // of this software and associated documentation files (the "Software"), to deal
14 // in the Software without restriction, including without limitation the rights
15 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 // copies of the Software, and to permit persons to whom the Software is
17 // furnished to do so, subject to the following conditions:
18 //
19 // The above copyright notice and this permission notice shall be included in all
20 // copies or substantial portions of the Software.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 // SOFTWARE.
29 
30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
32 
33 // see https://semver.org/
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 2 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 0 // backwards-compatible bug fixes
37 
38 ///////////////////////////////////////////////////////////////////////////////////////////////////
39 // public facing api - as minimal as possible
40 ///////////////////////////////////////////////////////////////////////////////////////////////////
41 
42 #include <chrono>  // high_resolution_clock
43 #include <cstring> // memcpy
44 #include <iosfwd>  // for std::ostream* custom output target in Config
45 #include <string>  // all names
46 #include <vector>  // holds all results
47 
48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
49 
50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
55 
56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57 #    define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
58 #else
59 #    define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
60 #endif
61 
62 #if defined(__clang__)
63 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64         _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
66 #else
67 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
69 #endif
70 
71 #if defined(__GNUC__)
72 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
74 #else
75 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76 #    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
77 #endif
78 
79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
80 #    include <iostream>
81 #    define ANKERL_NANOBENCH_LOG(x)                                                 \
82         do {                                                                        \
83             std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
84         } while (0)
85 #else
86 #    define ANKERL_NANOBENCH_LOG(x) \
87         do {                        \
88         } while (0)
89 #endif
90 
91 #if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
92 #    define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
93 #else
94 #    define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
95 #endif
96 
97 #if defined(__clang__)
98 #    define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
99 #else
100 #    define ANKERL_NANOBENCH_NO_SANITIZE(...)
101 #endif
102 
103 #if defined(_MSC_VER)
104 #    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
105 #else
106 #    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
107 #endif
108 
109 // workaround missing "is_trivially_copyable" in g++ < 5.0
110 // See https://stackoverflow.com/a/31798726/48181
111 #if defined(__GNUC__) && __GNUC__ < 5
112 #    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
113 #else
114 #    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
115 #endif
116 
117 // declarations ///////////////////////////////////////////////////////////////////////////////////
118 
119 namespace ankerl {
120 namespace nanobench {
121 
122 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
123                                std::chrono::steady_clock>::type;
124 class Bench;
125 struct Config;
126 class Result;
127 class Rng;
128 class BigO;
129 
130 /**
131  * @brief Renders output from a mustache-like template and benchmark results.
132  *
133  * The templating facility here is heavily inspired by [mustache - logic-less templates](https://mustache.github.io/).
134  * It adds a few more features that are necessary to get all of the captured data out of nanobench. Please read the
135  * excellent [mustache manual](https://mustache.github.io/mustache.5.html) to see what this is all about.
136  *
137  * nanobench output has two nested layers, *result* and *measurement*.  Here is a hierarchy of the allowed tags:
138  *
139  * * `{{#result}}` Marks the begin of the result layer. Whatever comes after this will be instantiated as often as
140  *   a benchmark result is available. Within it, you can use these tags:
141  *
142  *    * `{{title}}` See Bench::title().
143  *
144  *    * `{{name}}` Benchmark name, usually directly provided with Bench::run(), but can also be set with Bench::name().
145  *
146  *    * `{{unit}}` Unit, e.g. `byte`. Defaults to `op`, see Bench::title().
147  *
148  *    * `{{batch}}` Batch size, see Bench::batch().
149  *
150  *    * `{{complexityN}}` Value used for asymptotic complexity calculation. See Bench::complexityN().
151  *
152  *    * `{{epochs}}` Number of epochs, see Bench::epochs().
153  *
154  *    * `{{clockResolution}}` Accuracy of the clock, i.e. what's the smallest time possible to measure with the clock.
155  *      For modern systems, this can be around 20 ns. This value is automatically determined by nanobench at the first
156  *      benchmark that is run, and used as a static variable throughout the application's runtime.
157  *
158  *    * `{{clockResolutionMultiple}}` Configuration multiplier for `clockResolution`. See Bench::clockResolutionMultiple().
159  *      This is the target runtime for each measurement (epoch). That means the more accurate your clock is, the faster
160  *      will be the benchmark. Basing the measurement's runtime on the clock resolution is the main reason why nanobench is so fast.
161  *
162  *    * `{{maxEpochTime}}` Configuration for a maximum time each measurement (epoch) is allowed to take. Note that at least
163  *      a single iteration will be performed, even when that takes longer than maxEpochTime. See Bench::maxEpochTime().
164  *
165  *    * `{{minEpochTime}}` Minimum epoch time, usually not set. See Bench::minEpochTime().
166  *
167  *    * `{{minEpochIterations}}` See Bench::minEpochIterations().
168  *
169  *    * `{{epochIterations}}` See Bench::epochIterations().
170  *
171  *    * `{{warmup}}` Number of iterations used before measuring starts. See Bench::warmup().
172  *
173  *    * `{{relative}}` True or false, depending on the setting you have used. See Bench::relative().
174  *
175  *    Apart from these tags, it is also possible to use some mathematical operations on the measurement data. The operations
 *    are of the form `{{command(name)}}`.  Currently `name` can be one of `elapsed`, `iterations`. If performance counters
 *    are available (currently only on Linux systems), you also have `pagefaults`, `cpucycles`,
 *    `contextswitches`, `instructions`, `branchinstructions`, and `branchmisses`. All the measures (except `iterations`) are
 *    provided for a single iteration (so `elapsed` is the time a single iteration took). The following tags are available:
180  *
181  *    * `{{median(<name>)}}` Calculate median of a measurement data set, e.g. `{{median(elapsed)}}`.
182  *
183  *    * `{{average(<name>)}}` Average (mean) calculation.
184  *
185  *    * `{{medianAbsolutePercentError(<name>)}}` Calculates MdAPE, the Median Absolute Percentage Error. The MdAPE is an excellent
186  *      metric for the variation of measurements. It is more robust to outliers than the
187  *      [Mean absolute percentage error (M-APE)](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error).
188  *      @f[
189  *       \mathrm{MdAPE}(e) = \mathrm{med}\{| \frac{e_i - \mathrm{med}\{e\}}{e_i}| \}
190  *      @f]
191  *      E.g. for *elapsed*: First, @f$ \mathrm{med}\{e\} @f$ calculates the median by sorting and then taking the middle element
192  *      of all *elapsed* measurements. This is used to calculate the absolute percentage
193  *      error to this median for each measurement, as in  @f$ | \frac{e_i - \mathrm{med}\{e\}}{e_i}| @f$. All these results
194  *      are sorted, and the middle value is chosen as the median absolute percent error.
195  *
 *      This measurement is a bit hard to interpret, but it is very robust against outliers. E.g. a value of 5% means that half of the
 *      measurements deviate less than 5% from the median, and the other half deviates more than 5% from the median.
198  *
 *    * `{{sum(<name>)}}` Sum of all the measurements. E.g. `{{sum(iterations)}}` will give you the total number of iterations
 *      measured in this benchmark.
201  *
202  *    * `{{minimum(<name>)}}` Minimum of all measurements.
203  *
204  *    * `{{maximum(<name>)}}` Maximum of all measurements.
205  *
206  *    * `{{sumProduct(<first>, <second>)}}` Calculates the sum of the products of corresponding measures:
207  *      @f[
208  *          \mathrm{sumProduct}(a,b) = \sum_{i=1}^{n}a_i\cdot b_i
209  *      @f]
210  *      E.g. to calculate total runtime of the benchmark, you multiply iterations with elapsed time for each measurement, and
211  *      sum these results up:
212  *      `{{sumProduct(iterations, elapsed)}}`.
213  *
214  *    * `{{#measurement}}` To access individual measurement results, open the begin tag for measurements.
215  *
216  *       * `{{elapsed}}` Average elapsed wall clock time per iteration, in seconds.
217  *
218  *       * `{{iterations}}` Number of iterations in the measurement. The number of iterations will fluctuate due
219  *         to some applied randomness, to enhance accuracy.
220  *
221  *       * `{{pagefaults}}` Average number of pagefaults per iteration.
222  *
223  *       * `{{cpucycles}}` Average number of CPU cycles processed per iteration.
224  *
225  *       * `{{contextswitches}}` Average number of context switches per iteration.
226  *
227  *       * `{{instructions}}` Average number of retired instructions per iteration.
228  *
229  *       * `{{branchinstructions}}` Average number of branches executed per iteration.
230  *
231  *       * `{{branchmisses}}` Average number of branches that were missed per iteration.
232  *
233  *    * `{{/measurement}}` Ends the measurement tag.
234  *
235  * * `{{/result}}` Marks the end of the result layer. This is the end marker for the template part that will be instantiated
236  *   for each benchmark result.
237  *
238  *
239  *  For the layer tags *result* and *measurement* you additionally can use these special markers:
240  *
 *  * ``{{#-first}}`` - Begin marker of a template that will be instantiated *only for the first* entry in the layer. Use is only
 *    allowed between the begin and end markers of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
 *    ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-first}}``.
244  *
 *  * ``{{^-first}}`` - Begin marker of a template that will be instantiated *for each except the first* entry in the layer. That is,
 *    it is basically the inversion of ``{{#-first}}``. Use is only allowed between the begin and end markers of the layer.
 *    So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
248  *
249  *  * ``{{/-first}}`` - End marker for either ``{{#-first}}`` or ``{{^-first}}``.
250  *
 *  * ``{{#-last}}`` - Begin marker of a template that will be instantiated *only for the last* entry in the layer. Use is only
 *    allowed between the begin and end markers of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
 *    ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-last}}``.
254  *
 *  * ``{{^-last}}`` - Begin marker of a template that will be instantiated *for each except the last* entry in the layer. That is,
 *    it is basically the inversion of ``{{#-last}}``. Use is only allowed between the begin and end markers of the layer.
 *    So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
258  *
259  *  * ``{{/-last}}`` - End marker for either ``{{#-last}}`` or ``{{^-last}}``.
260  *
261    @verbatim embed:rst
262 
263    For an overview of all the possible data you can get out of nanobench, please see the tutorial at :ref:`tutorial-template-json`.
264 
265    The templates that ship with nanobench are:
266 
267    * :cpp:func:`templates::csv() <ankerl::nanobench::templates::csv()>`
268    * :cpp:func:`templates::json() <ankerl::nanobench::templates::json()>`
269    * :cpp:func:`templates::htmlBoxplot() <ankerl::nanobench::templates::htmlBoxplot()>`
270    * :cpp:func:`templates::pyperf() <ankerl::nanobench::templates::pyperf()>`
271 
272    @endverbatim
273  *
274  * @param mustacheTemplate The template.
275  * @param bench Benchmark, containing all the results.
276  * @param out Output for the generated output.
277  */
278 void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
279 void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
280 
281 /**
282  * Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but for when
283  * you only have results available.
284  *
285  * @param mustacheTemplate The template.
286  * @param results All the results to be used for rendering.
287  * @param out Output for the generated output.
288  */
289 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
290 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
291 
292 // Contains mustache-like templates
293 namespace templates {
294 
295 /*!
296   @brief CSV data for the benchmark results.
297 
298   Generates a comma-separated values dataset. First line is the header, each following line is a summary of each benchmark run.
299 
300   @verbatim embed:rst
301   See the tutorial at :ref:`tutorial-template-csv` for an example.
302   @endverbatim
303  */
304 char const* csv() noexcept;
305 
306 /*!
307   @brief HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example output.
308 
309   The output uses only the elapsed wall clock time, and displays each epoch as a single dot.
310   @verbatim embed:rst
311   See the tutorial at :ref:`tutorial-template-html` for an example.
312   @endverbatim
313 
314   @see ankerl::nanobench::render()
315  */
316 char const* htmlBoxplot() noexcept;
317 
318 /*!
 @brief Output in pyperf compatible JSON format, which can be used for further analysis.
320  @verbatim embed:rst
321  See the tutorial at :ref:`tutorial-template-pyperf` for an example how to further analyze the output.
322  @endverbatim
323  */
324 char const* pyperf() noexcept;
325 
326 /*!
327   @brief Template to generate JSON data.
328 
329   The generated JSON data contains *all* data that has been generated. All times are as double values, in seconds. The output can get
330   quite large.
331   @verbatim embed:rst
332   See the tutorial at :ref:`tutorial-template-json` for an example.
333   @endverbatim
334  */
335 char const* json() noexcept;
336 
337 } // namespace templates
338 
339 namespace detail {
340 
341 template <typename T>
342 struct PerfCountSet;
343 
344 class IterationLogic;
345 class PerformanceCounters;
346 
347 #if ANKERL_NANOBENCH(PERF_COUNTERS)
348 class LinuxPerformanceCounters;
349 #endif
350 
351 } // namespace detail
352 } // namespace nanobench
353 } // namespace ankerl
354 
355 // definitions ////////////////////////////////////////////////////////////////////////////////////
356 
357 namespace ankerl {
358 namespace nanobench {
359 namespace detail {
360 
// Bundle of one value per supported Linux perf event counter.
// T varies by use site (e.g. integral raw counts or double averages) — the
// members are value-initialized so an unfilled set reads as all zeros.
template <typename T>
struct PerfCountSet {
    T pageFaults{};
    T cpuCycles{};
    T contextSwitches{};
    T instructions{};
    T branchInstructions{};
    T branchMisses{};
};
370 
371 } // namespace detail
372 
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// All settings for one benchmark run. Populated through Bench's fluent setters
// (title(), unit(), epochs(), ...) and stored with each Result so the
// configuration used for a run can be inspected afterwards.
struct Config {
    // actual benchmark config
    std::string mBenchmarkTitle = "benchmark";                               // table header; changing it starts a new table
    std::string mBenchmarkName = "noname";                                   // table row name, see Bench::name()
    std::string mUnit = "op";                                                // operation unit shown in headers, e.g. "byte"
    double mBatch = 1.0;                                                     // number of units processed per iteration
    double mComplexityN = -1.0;                                              // N for asymptotic complexity; -1 means unset
    size_t mNumEpochs = 11;                                                  // number of measurements taken per benchmark
    size_t mClockResolutionMultiple = static_cast<size_t>(1000);             // target epoch runtime = multiple * clock resolution
    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // upper bound for a single epoch
    std::chrono::nanoseconds mMinEpochTime{};                                // lower bound for a single epoch; 0 means unset
    uint64_t mMinEpochIterations{1};
    uint64_t mEpochIterations{0}; // If not 0, run *exactly* these number of iterations per epoch.
    uint64_t mWarmup = 0;         // iterations performed before measuring starts
    std::ostream* mOut = nullptr; // output target; nullptr means default (std::cout)
    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // time unit for the markdown output
    std::string mTimeUnitName = "ns";
    bool mShowPerformanceCounters = true;
    bool mIsRelative = false; // show results relative to the first benchmark in the table

    // Special members are declared here and defined out-of-line to keep this header light.
    Config();
    ~Config();
    Config& operator=(Config const&);
    Config& operator=(Config&&);
    Config(Config const&);
    Config(Config&&) noexcept;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
402 
403 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class Result {
public:
    // The measurement kinds tracked per epoch. `_size` is a sentinel: it is the number
    // of valid measures, and also the "not found" return value of fromString().
    enum class Measure : size_t {
        elapsed,
        iterations,
        pagefaults,
        cpucycles,
        contextswitches,
        instructions,
        branchinstructions,
        branchmisses,
        _size
    };

    // Creates an empty result; a copy of the benchmark's configuration is stored with it.
    explicit Result(Config const& benchmarkConfig);

    ~Result();
    Result& operator=(Result const&);
    Result& operator=(Result&&);
    Result(Result const&);
    Result(Result&&) noexcept;

    // adds new measurement results
    // all values are scaled by iters (except iters...)
    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);

    // Configuration that was active when this result was produced.
    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;

    // Statistics over all recorded epochs of the given measure.
    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;

    // True if any data was recorded for measure m (e.g. perf counters may be unavailable).
    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
    // Raw value of epoch idx for measure m.
    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
    // Number of recorded epochs.
    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;

    // Finds string, if not found, returns _size.
    static Measure fromString(std::string const& str);

private:
    Config mConfig{};
    std::vector<std::vector<double>> mNameToMeasurements{};
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
454 
455 /**
456  * An extremely fast random generator. Currently, this implements *RomuDuoJr*, developed by Mark Overton. Source:
457  * http://www.romu-random.org/
458  *
 * RomuDuoJr is extremely fast and provides reasonably good randomness. Not enough for large jobs, but definitely
 * good enough for a benchmarking framework.
461  *
462  *  * Estimated capacity: @f$ 2^{51} @f$ bytes
463  *  * Register pressure: 4
464  *  * State size: 128 bits
465  *
 * This random generator is a drop-in replacement for the generators supplied by ``<random>``. It is not
 * cryptographically secure. Its intended purpose is to be very fast so that benchmarks that make use
 * of randomness are not distorted too much by the random generator.
469  *
470  * Rng also provides a few non-standard helpers, optimized for speed.
471  */
class Rng final {
public:
    /**
     * @brief This RNG provides 64bit randomness.
     */
    using result_type = uint64_t;

    // Parenthesized names so potential min()/max() macros (e.g. from <windows.h>) don't break the declarations.
    static constexpr uint64_t(min)();
    static constexpr uint64_t(max)();

    /**
     * As a safety precaution, we don't allow copying. Copying a PRNG would mean you would have two random generators that produce the
     * same sequence, which is generally not what one wants. Instead create a new rng with the default constructor Rng(), which is
     * automatically seeded from `std::random_device`. If you really need a copy, use copy().
     */
    Rng(Rng const&) = delete;

    /**
     * Same as Rng(Rng const&), we don't allow assignment. If you need a new Rng create one with the default constructor Rng().
     */
    Rng& operator=(Rng const&) = delete;

    // moving is ok
    Rng(Rng&&) noexcept = default;
    Rng& operator=(Rng&&) noexcept = default;
    ~Rng() noexcept = default;

    /**
     * @brief Creates a new Random generator with random seed.
     *
     * Instead of a default seed (as the random generators from the STD), this properly seeds the random generator from
     * `std::random_device`. It guarantees correct seeding. Note that seeding can be relatively slow, depending on the source of
     * randomness used. So it is best to create a Rng once and use it for all your randomness purposes.
     */
    Rng();

    /*!
      Creates a new Rng that is seeded with a specific seed. Each Rng created from the same seed will produce the same randomness
      sequence. This can be useful for deterministic behavior.

      @verbatim embed:rst
      .. note::

         The random algorithm might change between nanobench releases. Whenever a faster and/or better random
         generator becomes available, I will switch the implementation.
      @endverbatim

      As per the Romu paper, this seeds the Rng with splitMix64 algorithm and performs 10 initial rounds for further mixing up of the
      internal state.

      @param seed  The 64bit seed. All values are allowed, even 0.
     */
    explicit Rng(uint64_t seed) noexcept;
    // Directly sets the full 128 bit internal state. No mixing is performed; the caller is
    // responsible for providing well-distributed values.
    Rng(uint64_t x, uint64_t y) noexcept;

    /**
     * Creates a copy of the Rng, thus the copy provides exactly the same random sequence as the original.
     */
    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;

    /**
     * @brief Produces a 64bit random value. This should be very fast, thus it is marked as inline. In my benchmark, this is ~46 times
     * faster than `std::default_random_engine` for producing 64bit random values. It seems that the fastest std contender is
     * `std::mt19937_64`. Still, this RNG is 2-3 times as fast.
     *
     * @return uint64_t The next 64 bit random value.
     */
    inline uint64_t operator()() noexcept;

    // Slightly biased bounded random number; see the documentation of bounded() below.

    /**
     * Generates a random number between 0 and range (excluding range).
     *
     * The algorithm only produces 32bit numbers, and is slightly biased. The effect is quite small unless your range is close to the
     * maximum value of an integer. It is possible to correct the bias with rejection sampling (see
     * [here](https://lemire.me/blog/2016/06/30/fast-random-shuffling/)), but this is most likely irrelevant in practice for the
     * purposes of this Rng.
     *
     * See Daniel Lemire's blog post [A fast alternative to the modulo
     * reduction](https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/)
     *
     * @param range Upper exclusive range. E.g a value of 3 will generate random numbers 0, 1, 2.
     * @return uint32_t Generated random values in range [0, range(.
     */
    inline uint32_t bounded(uint32_t range) noexcept;

    // random double in range [0, 1(
    // see http://prng.di.unimi.it/

    /**
     * Provides a random uniform double value between 0 and 1. This uses the method described in [Generating uniform doubles in the
     * unit interval](http://prng.di.unimi.it/), and is extremely fast.
     *
     * @return double Uniformly distributed double value in range [0,1(, excluding 1.
     */
    inline double uniform01() noexcept;

    /**
     * Shuffles all entries in the given container. Although this has a slight bias due to the implementation of bounded(), this is
     * preferable to `std::shuffle` because it is over 5 times faster. See Daniel Lemire's blog post [Fast random
     * shuffling](https://lemire.me/blog/2016/06/30/fast-random-shuffling/).
     *
     * @param container The whole container will be shuffled.
     */
    template <typename Container>
    void shuffle(Container& container) noexcept;

private:
    // Bitwise left-rotate helper, as used by the Romu generators.
    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;

    // 128 bit internal state of the RomuDuoJr generator.
    uint64_t mX;
    uint64_t mY;
};
586 
587 /**
588  * @brief Main entry point to nanobench's benchmarking facility.
589  *
590  * It holds configuration and results from one or more benchmark runs. Usually it is used in a single line, where the object is
591  * constructed, configured, and then a benchmark is run. E.g. like this:
592  *
593  *     ankerl::nanobench::Bench().unit("byte").batch(1000).run("random fluctuations", [&] {
594  *         // here be the benchmark code
595  *     });
596  *
597  * In that example Bench() constructs the benchmark, it is then configured with unit() and batch(), and after configuration a
598  * benchmark is executed with run(). Once run() has finished, it prints the result to `std::cout`. It would also store the results
599  * in the Bench instance, but in this case the object is immediately destroyed so it's not available any more.
600  */
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
602 class Bench {
603 public:
604     /**
605      * @brief Creates a new benchmark for configuration and running of benchmarks.
606      */
607     Bench();
608 
609     Bench(Bench&& other);
610     Bench& operator=(Bench&& other);
611     Bench(Bench const& other);
612     Bench& operator=(Bench const& other);
613     ~Bench() noexcept;
614 
615     /*!
616       @brief Repeatedly calls `op()` based on the configuration, and performs measurements.
617 
618       This call is marked with `noinline` to prevent the compiler to optimize beyond different benchmarks. This can have quite a big
619       effect on benchmark accuracy.
620 
621       @verbatim embed:rst
622       .. note::
623 
624         Each call to your lambda must have a side effect that the compiler can't possibly optimize it away. E.g. add a result to an
625         externally defined number (like `x` in the above example), and finally call `doNotOptimizeAway` on the variables the compiler
626         must not remove. You can also use :cpp:func:`ankerl::nanobench::doNotOptimizeAway` directly in the lambda, but be aware that
627         this has a small overhead.
628 
629       @endverbatim
630 
631       @tparam Op The code to benchmark.
632      */
633     template <typename Op>
634     ANKERL_NANOBENCH(NOINLINE)
635     Bench& run(char const* benchmarkName, Op&& op);
636 
637     template <typename Op>
638     ANKERL_NANOBENCH(NOINLINE)
639     Bench& run(std::string const& benchmarkName, Op&& op);
640 
641     /**
642      * @brief Same as run(char const* benchmarkName, Op op), but instead uses the previously set name.
643      * @tparam Op The code to benchmark.
644      */
645     template <typename Op>
646     ANKERL_NANOBENCH(NOINLINE)
647     Bench& run(Op&& op);
648 
649     /**
650      * @brief Title of the benchmark, will be shown in the table header. Changing the title will start a new markdown table.
651      *
652      * @param benchmarkTitle The title of the benchmark.
653      */
654     Bench& title(char const* benchmarkTitle);
655     Bench& title(std::string const& benchmarkTitle);
656     ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
657 
658     /// Name of the benchmark, will be shown in the table row.
659     Bench& name(char const* benchmarkName);
660     Bench& name(std::string const& benchmarkName);
661     ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
662 
663     /**
664      * @brief Sets the batch size.
665      *
666      * E.g. number of processed byte, or some other metric for the size of the processed data in each iteration. If you benchmark
667      * hashing of a 1000 byte long string and want byte/sec as a result, you can specify 1000 as the batch size.
668      *
669      * @tparam T Any input type is internally cast to `double`.
670      * @param b batch size
671      */
672     template <typename T>
673     Bench& batch(T b) noexcept;
674     ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
675 
676     /**
677      * @brief Sets the operation unit.
678      *
679      * Defaults to "op". Could be e.g. "byte" for string processing. This is used for the table header, e.g. to show `ns/byte`. Use
680      * singular (*byte*, not *bytes*). A change clears the currently collected results.
681      *
682      * @param unit The unit name.
683      */
684     Bench& unit(char const* unit);
685     Bench& unit(std::string const& unit);
686     ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
687 
688     /**
689      * @brief Sets the time unit to be used for the default output.
690      *
691      * Nanobench defaults to using ns (nanoseconds) as output in the markdown. For some benchmarks this is too coarse, so it is
692      * possible to configure this. E.g. use `timeUnit(1ms, "ms")` to show `ms/op` instead of `ns/op`.
693      *
694      * @param tu Time unit to display the results in, default is 1ns.
695      * @param tuName Name for the time unit, default is "ns"
696      */
697     Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
698     ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
699     ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
700 
701     /**
702      * @brief Set the output stream where the resulting markdown table will be printed to.
703      *
704      * The default is `&std::cout`. You can disable all output by setting `nullptr`.
705      *
706      * @param outstream Pointer to output stream, can be `nullptr`.
707      */
708     Bench& output(std::ostream* outstream) noexcept;
709     ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
710 
711     /**
     * Modern processors have a very accurate clock, being able to measure as low as 20 nanoseconds. This is the main trick nanobench
     * uses to be so fast: we find out how accurate the clock is, then run the benchmark only so often that the clock's accuracy is good
     * enough for accurate measurements.
715      *
716      * The default is to run one epoch for 1000 times the clock resolution. So for 20ns resolution and 11 epochs, this gives a total
717      * runtime of
718      *
719      * @f[
720      * 20ns * 1000 * 11 \approx 0.2ms
721      * @f]
722      *
723      * To be precise, nanobench adds a 0-20% random noise to each evaluation. This is to prevent any aliasing effects, and further
724      * improves accuracy.
725      *
726      * Total runtime will be higher though: Some initial time is needed to find out the target number of iterations for each epoch, and
727      * there is some overhead involved to start & stop timers and calculate resulting statistics and writing the output.
728      *
729      * @param multiple Target number of times of clock resolution. Usually 1000 is a good compromise between runtime and accuracy.
730      */
731     Bench& clockResolutionMultiple(size_t multiple) noexcept;
732     ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
733 
734     /**
735      * @brief Controls number of epochs, the number of measurements to perform.
736      *
737      * The reported result will be the median of evaluation of each epoch. The higher you choose this, the more
738      * deterministic the result be and outliers will be more easily removed. Also the `err%` will be more accurate the higher this
739      * number is. Note that the `err%` will not necessarily decrease when number of epochs is increased. But it will be a more accurate
740      * representation of the benchmarked code's runtime stability.
741      *
742      * Choose the value wisely. In practice, 11 has been shown to be a reasonable choice between runtime performance and accuracy.
     * This setting goes hand in hand with minEpochIterations() (or minEpochTime()). If you are more interested in *median* runtime, you
744      * might want to increase epochs(). If you are more interested in *mean* runtime, you might want to increase minEpochIterations()
745      * instead.
746      *
747      * @param numEpochs Number of epochs.
748      */
749     Bench& epochs(size_t numEpochs) noexcept;
750     ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
751 
752     /**
753      * @brief Upper limit for the runtime of each epoch.
754      *
     * As a safety precaution if the clock is not very accurate, we can set an upper limit for the maximum evaluation time per
756      * epoch. Default is 100ms. At least a single evaluation of the benchmark is performed.
757      *
758      * @see minEpochTime(), minEpochIterations()
759      *
760      * @param t Maximum target runtime for a single epoch.
761      */
762     Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
763     ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
764 
765     /**
766      * @brief Minimum time each epoch should take.
767      *
768      * Default is zero, so we are fully relying on clockResolutionMultiple(). In most cases this is exactly what you want. If you see
769      * that the evaluation is unreliable with a high `err%`, you can increase either minEpochTime() or minEpochIterations().
770      *
771      * @see maxEpochTime(), minEpochIterations()
772      *
773      * @param t Minimum time each epoch should take.
774      */
775     Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
776     ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
777 
778     /**
779      * @brief Sets the minimum number of iterations each epoch should take.
780      *
781      * Default is 1, and we rely on clockResolutionMultiple(). If the `err%` is high and you want a more smooth result, you might want
     * to increase the minimum number of iterations, or increase the minEpochTime().
783      *
784      * @see minEpochTime(), maxEpochTime(), minEpochIterations()
785      *
786      * @param numIters Minimum number of iterations per epoch.
787      */
788     Bench& minEpochIterations(uint64_t numIters) noexcept;
789     ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
790 
791     /**
792      * Sets exactly the number of iterations for each epoch. Ignores all other epoch limits. This forces nanobench to use exactly
793      * the given number of iterations for each epoch, not more and not less. Default is 0 (disabled).
794      *
795      * @param numIters Exact number of iterations to use. Set to 0 to disable.
796      */
797     Bench& epochIterations(uint64_t numIters) noexcept;
798     ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
799 
800     /**
801      * @brief Sets a number of iterations that are initially performed without any measurements.
802      *
803      * Some benchmarks need a few evaluations to warm up caches / database / whatever access. Normally this should not be needed, since
804      * we show the median result so initial outliers will be filtered away automatically. If the warmup effect is large though, you
805      * might want to set it. Default is 0.
806      *
807      * @param numWarmupIters Number of warmup iterations.
808      */
809     Bench& warmup(uint64_t numWarmupIters) noexcept;
810     ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
811 
812     /**
813      * @brief Marks the next run as the baseline.
814      *
815      * Call `relative(true)` to mark the run as the baseline. Successive runs will be compared to this run. It is calculated by
816      *
817      * @f[
818      * 100\% * \frac{baseline}{runtime}
819      * @f]
820      *
821      *  * 100% means it is exactly as fast as the baseline
822      *  * >100% means it is faster than the baseline. E.g. 200% means the current run is twice as fast as the baseline.
823      *  * <100% means it is slower than the baseline. E.g. 50% means it is twice as slow as the baseline.
824      *
825      * See the tutorial section "Comparing Results" for example usage.
826      *
827      * @param isRelativeEnabled True to enable processing
828      */
829     Bench& relative(bool isRelativeEnabled) noexcept;
830     ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
831 
832     /**
833      * @brief Enables/disables performance counters.
834      *
835      * On Linux nanobench has a powerful feature to use performance counters. This enables counting of retired instructions, count
     * number of branches, missed branches, etc. By default this is enabled, but you can disable it if you don't need that feature.
837      *
838      * @param showPerformanceCounters True to enable, false to disable.
839      */
840     Bench& performanceCounters(bool showPerformanceCounters) noexcept;
841     ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
842 
843     /**
844      * @brief Retrieves all benchmark results collected by the bench object so far.
845      *
846      * Each call to run() generates a Result that is stored within the Bench instance. This is mostly for advanced users who want to
     * see all the nitty gritty details.
848      *
849      * @return All results collected so far.
850      */
851     ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
852 
853     /*!
854       @verbatim embed:rst
855 
856       Convenience shortcut to :cpp:func:`ankerl::nanobench::doNotOptimizeAway`.
857 
858       @endverbatim
859      */
860     template <typename Arg>
861     Bench& doNotOptimizeAway(Arg&& arg);
862 
863     /*!
864       @verbatim embed:rst
865 
866       Sets N for asymptotic complexity calculation, so it becomes possible to calculate `Big O
867       <https://en.wikipedia.org/wiki/Big_O_notation>`_ from multiple benchmark evaluations.
868 
869       Use :cpp:func:`ankerl::nanobench::Bench::complexityBigO` when the evaluation has finished. See the tutorial
870       :ref:`asymptotic-complexity` for details.
871 
872       @endverbatim
873 
874       @tparam T Any type is cast to `double`.
875       @param b Length of N for the next benchmark run, so it is possible to calculate `bigO`.
876      */
877     template <typename T>
878     Bench& complexityN(T b) noexcept;
879     ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
880 
881     /*!
      Calculates [Big O](https://en.wikipedia.org/wiki/Big_O_notation) of the results with all preconfigured complexity functions.
883       Currently these complexity functions are fitted into the benchmark results:
884 
885        @f$ \mathcal{O}(1) @f$,
886        @f$ \mathcal{O}(n) @f$,
887        @f$ \mathcal{O}(\log{}n) @f$,
888        @f$ \mathcal{O}(n\log{}n) @f$,
889        @f$ \mathcal{O}(n^2) @f$,
890        @f$ \mathcal{O}(n^3) @f$.
891 
892       If we e.g. evaluate the complexity of `std::sort`, this is the result of `std::cout << bench.complexityBigO()`:
893 
894       ```
895       |   coefficient |   err% | complexity
896       |--------------:|-------:|------------
897       |   5.08935e-09 |   2.6% | O(n log n)
898       |   6.10608e-08 |   8.0% | O(n)
899       |   1.29307e-11 |  47.2% | O(n^2)
900       |   2.48677e-15 |  69.6% | O(n^3)
901       |   9.88133e-06 | 132.3% | O(log n)
902       |   5.98793e-05 | 162.5% | O(1)
903       ```
904 
905       So in this case @f$ \mathcal{O}(n\log{}n) @f$ provides the best approximation.
906 
907       @verbatim embed:rst
908       See the tutorial :ref:`asymptotic-complexity` for details.
909       @endverbatim
910       @return Evaluation results, which can be printed or otherwise inspected.
911      */
912     std::vector<BigO> complexityBigO() const;
913 
914     /**
915      * @brief Calculates bigO for a custom function.
916      *
917      * E.g. to calculate the mean squared error for @f$ \mathcal{O}(\log{}\log{}n) @f$, which is not part of the default set of
918      * complexityBigO(), you can do this:
919      *
920      * ```
921      * auto logLogN = bench.complexityBigO("O(log log n)", [](double n) {
922      *     return std::log2(std::log2(n));
923      * });
924      * ```
925      *
926      * The resulting mean squared error can be printed with `std::cout << logLogN`. E.g. it prints something like this:
927      *
928      * ```text
929      * 2.46985e-05 * O(log log n), rms=1.48121
930      * ```
931      *
932      * @tparam Op Type of mapping operation.
933      * @param name Name for the function, e.g. "O(log log n)"
934      * @param op Op's operator() maps a `double` with the desired complexity function, e.g. `log2(log2(n))`.
935      * @return BigO Error calculation, which is streamable to std::cout.
936      */
937     template <typename Op>
938     BigO complexityBigO(char const* name, Op op) const;
939 
940     template <typename Op>
941     BigO complexityBigO(std::string const& name, Op op) const;
942 
943     /*!
944       @verbatim embed:rst
945 
946       Convenience shortcut to :cpp:func:`ankerl::nanobench::render`.
947 
948       @endverbatim
949      */
950     Bench& render(char const* templateContent, std::ostream& os);
951     Bench& render(std::string const& templateContent, std::ostream& os);
952 
953     Bench& config(Config const& benchmarkConfig);
954     ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
955 
956 private:
957     Config mConfig{};
958     std::vector<Result> mResults{};
959 };
960 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
961 
962 /**
963  * @brief Makes sure none of the given arguments are optimized away by the compiler.
964  *
965  * @tparam Arg Type of the argument that shouldn't be optimized away.
966  * @param arg The input that we mark as being used, even though we don't do anything with it.
967  */
968 template <typename Arg>
969 void doNotOptimizeAway(Arg&& arg);
970 
971 namespace detail {
972 
973 #if defined(_MSC_VER)
974 void doNotOptimizeAwaySink(void const*);
975 
976 template <typename T>
977 void doNotOptimizeAway(T const& val);
978 
979 #else
980 
// This assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
// that seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
983 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
template <typename T>
void doNotOptimizeAway(T const& val) {
    // Empty asm statement that claims to read `val` ("r,m" input constraint) and
    // to touch memory ("memory" clobber). No instructions are emitted, but the
    // compiler must keep the computation that produced `val`.
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : : "r,m"(val) : "memory");
}
989 
template <typename T>
void doNotOptimizeAway(T& val) {
    // Non-const overload: the "+" read-write constraint tells the compiler that
    // `val` may be both read and modified here, so neither prior writes to it nor
    // later reads of it can be optimized away.
#    if defined(__clang__)
    // constraint order differs for clang vs. gcc (taken from Google Benchmark)
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+r,m"(val) : : "memory");
#    else
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+m,r"(val) : : "memory");
#    endif
}
1000 #endif
1001 
1002 // internally used, but visible because run() is templated.
1003 // Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1004 // having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)1005 ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
class IterationLogic {
public:
    // Sets up the iteration plan from the benchmark's configuration.
    explicit IterationLogic(Bench const& config) noexcept;
    ~IterationLogic();

    // Number of iterations the next measurement round should perform; 0 means
    // enough data has been collected and measuring is done (see Bench::run()).
    ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
    // Records the elapsed time (and performance counter state) of the round
    // that was just measured.
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
    // Moves the finished benchmark Result into `results`.
    void moveResultTo(std::vector<Result>& results) noexcept;

private:
    struct Impl;
    Impl* mPimpl; // raw-pointer pimpl: avoids <memory> include & unique_ptr template cost (see note above)
};
1019 ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1020 
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)1021 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class PerformanceCounters {
public:
    // Not copyable: wraps platform measurement state.
    PerformanceCounters(PerformanceCounters const&) = delete;
    PerformanceCounters& operator=(PerformanceCounters const&) = delete;

    PerformanceCounters();
    ~PerformanceCounters();

    // Start / stop one measurement interval around the benchmarked code.
    void beginMeasure();
    void endMeasure();
    // Updates val() from the last interval, given the number of iterations measured.
    void updateResults(uint64_t numIters);

    // Counter values from the last measurement.
    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
    // Which counters are actually available (platform dependent).
    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;

private:
#if ANKERL_NANOBENCH(PERF_COUNTERS)
    LinuxPerformanceCounters* mPc = nullptr; // Linux perf_event backend; lifetime handled in the implementation part
#endif
    PerfCountSet<uint64_t> mVal{};
    PerfCountSet<bool> mHas{};
};
1044 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1045 
1046 // Gets the singleton
1047 PerformanceCounters& performanceCounters();
1048 
1049 } // namespace detail
1050 
1051 class BigO {
1052 public:
1053     using RangeMeasure = std::vector<std::pair<double, double>>;
1054 
1055     template <typename Op>
mapRangeMeasure(RangeMeasure data,Op op)1056     static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
1057         for (auto& rangeMeasure : data) {
1058             rangeMeasure.first = op(rangeMeasure.first);
1059         }
1060         return data;
1061     }
1062 
1063     static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1064 
1065     template <typename Op>
BigO(char const * bigOName,RangeMeasure const & rangeMeasure,Op rangeToN)1066     BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1067         : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1068 
1069     template <typename Op>
BigO(std::string const & bigOName,RangeMeasure const & rangeMeasure,Op rangeToN)1070     BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1071         : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1072 
1073     BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1074     BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
1075     ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1076     ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1077     ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1078     ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1079 
1080 private:
1081     std::string mName{};
1082     double mConstant{};
1083     double mNormalizedRootMeanSquare{};
1084 };
1085 std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1086 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1087 
1088 } // namespace nanobench
1089 } // namespace ankerl
1090 
1091 // implementation /////////////////////////////////////////////////////////////////////////////////
1092 
1093 namespace ankerl {
1094 namespace nanobench {
1095 
uint64_t(Rng::min)1096 constexpr uint64_t(Rng::min)() {
1097     return 0;
1098 }
1099 
uint64_t(Rng::max)1100 constexpr uint64_t(Rng::max)() {
1101     return (std::numeric_limits<uint64_t>::max)();
1102 }
1103 
ANKERL_NANOBENCH_NO_SANITIZE("integer")
uint64_t Rng::operator()() noexcept {
    // Returns the current mX, then advances the two 64-bit state words with a
    // multiply + subtract-rotate step. Unsigned wraparound is intentional,
    // hence the no-sanitize("integer") attribute.
    auto x = mX;

    mX = UINT64_C(15241094284759029579) * mY;
    mY = rotl(mY - x, 27);

    return x;
}
1113 
1114 ANKERL_NANOBENCH_NO_SANITIZE("integer")
bounded(uint32_t range)1115 uint32_t Rng::bounded(uint32_t range) noexcept {
1116     uint64_t r32 = static_cast<uint32_t>(operator()());
1117     auto multiresult = r32 * range;
1118     return static_cast<uint32_t>(multiresult >> 32U);
1119 }
1120 
uniform01()1121 double Rng::uniform01() noexcept {
1122     auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1123     // can't use union in c++ here for type puning, it's undefined behavior.
1124     // std::memcpy is optimized anyways.
1125     double d;
1126     std::memcpy(&d, &i, sizeof(double));
1127     return d - 1.0;
1128 }
1129 
1130 template <typename Container>
shuffle(Container & container)1131 void Rng::shuffle(Container& container) noexcept {
1132     auto size = static_cast<uint32_t>(container.size());
1133     for (auto i = size; i > 1U; --i) {
1134         using std::swap;
1135         auto p = bounded(i); // number in [0, i)
1136         swap(container[i - 1], container[p]);
1137     }
1138 }
1139 
rotl(uint64_t x,unsigned k)1140 constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1141     return (x << k) | (x >> (64U - k));
1142 }
1143 
template <typename Op>
ANKERL_NANOBENCH_NO_SANITIZE("integer")
Bench& Bench::run(Op&& op) {
    // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
    detail::IterationLogic iterationLogic(*this);
    auto& pc = detail::performanceCounters();

    // Measurement rounds: numIters() tells us how many iterations to run next;
    // 0 means enough data has been collected.
    while (auto n = iterationLogic.numIters()) {
        pc.beginMeasure();
        Clock::time_point before = Clock::now();
        while (n-- > 0) {
            op();
        }
        Clock::time_point after = Clock::now();
        pc.endMeasure();
        // numIters() is const and still reports the round size just measured;
        // add() below records the timing and may change the plan.
        pc.updateResults(iterationLogic.numIters());
        iterationLogic.add(after - before, pc);
    }
    // Transfer the collected Result into this bench's result list.
    iterationLogic.moveResultTo(mResults);
    return *this;
}
1165 
1166 // Performs all evaluations.
1167 template <typename Op>
run(char const * benchmarkName,Op && op)1168 Bench& Bench::run(char const* benchmarkName, Op&& op) {
1169     name(benchmarkName);
1170     return run(std::forward<Op>(op));
1171 }
1172 
1173 template <typename Op>
run(std::string const & benchmarkName,Op && op)1174 Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1175     name(benchmarkName);
1176     return run(std::forward<Op>(op));
1177 }
1178 
1179 template <typename Op>
complexityBigO(char const * benchmarkName,Op op)1180 BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1181     return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1182 }
1183 
1184 template <typename Op>
complexityBigO(std::string const & benchmarkName,Op op)1185 BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1186     return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1187 }
1188 
// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
// Any argument is cast to double. Returns *this for chaining.
template <typename T>
Bench& Bench::batch(T b) noexcept {
    mConfig.mBatch = static_cast<double>(b);
    return *this;
}
1196 
// Sets the computation complexity of the next run. Any argument is cast to double.
// Used by complexityBigO() to fit asymptotic complexity. Returns *this for chaining.
template <typename T>
Bench& Bench::complexityN(T n) noexcept {
    mConfig.mComplexityN = static_cast<double>(n);
    return *this;
}
1203 
// Convenience: makes sure none of the given arguments are optimized away by the compiler.
// Forwards to the free function detail::doNotOptimizeAway(). Returns *this for chaining.
template <typename Arg>
Bench& Bench::doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
    return *this;
}
1210 
// Makes sure none of the given arguments are optimized away by the compiler.
// Thin forwarding wrapper over the platform-specific detail implementations above.
template <typename Arg>
void doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
}
1216 
1217 namespace detail {
1218 
1219 #if defined(_MSC_VER)
template <typename T>
void doNotOptimizeAway(T const& val) {
    // MSVC has no inline asm escape hatch on x64: pass the address to the opaque
    // sink function declared above so the optimizer must materialize `val`.
    doNotOptimizeAwaySink(&val);
}
1224 
1225 #endif
1226 
1227 } // namespace detail
1228 } // namespace nanobench
1229 } // namespace ankerl
1230 
1231 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1232 
1233 ///////////////////////////////////////////////////////////////////////////////////////////////////
1234 // implementation part - only visible in .cpp
1235 ///////////////////////////////////////////////////////////////////////////////////////////////////
1236 
1237 #    include <algorithm> // sort, reverse
1238 #    include <atomic>    // compare_exchange_strong in loop overhead
1239 #    include <cstdlib>   // getenv
1240 #    include <cstring>   // strstr, strncmp
1241 #    include <fstream>   // ifstream to parse proc files
1242 #    include <iomanip>   // setw, setprecision
1243 #    include <iostream>  // cout
1244 #    include <numeric>   // accumulate
1245 #    include <random>    // random_device
1246 #    include <sstream>   // to_s in Number
1247 #    include <stdexcept> // throw for rendering templates
1248 #    include <tuple>     // std::tie
1249 #    if defined(__linux__)
1250 #        include <unistd.h> //sysconf
1251 #    endif
1252 #    if ANKERL_NANOBENCH(PERF_COUNTERS)
1253 #        include <map> // map
1254 
1255 #        include <linux/perf_event.h>
1256 #        include <sys/ioctl.h>
1257 #        include <sys/syscall.h>
1258 #        include <unistd.h>
1259 #    endif
1260 
1261 // declarations ///////////////////////////////////////////////////////////////////////////////////
1262 
1263 namespace ankerl {
1264 namespace nanobench {
1265 
1266 // helper stuff that is only intended to be used internally
1267 namespace detail {
1268 
1269 struct TableInfo;
1270 
1271 // formatting utilities
1272 namespace fmt {
1273 
1274 class NumSep;
1275 class StreamStateRestorer;
1276 class Number;
1277 class MarkDownColumn;
1278 class MarkDownCode;
1279 
1280 } // namespace fmt
1281 } // namespace detail
1282 } // namespace nanobench
1283 } // namespace ankerl
1284 
1285 // definitions ////////////////////////////////////////////////////////////////////////////////////
1286 
1287 namespace ankerl {
1288 namespace nanobench {
1289 
1290 uint64_t splitMix64(uint64_t& state) noexcept;
1291 
1292 namespace detail {
1293 
// helpers to get double values
// Casts any arithmetic-convertible value to double.
template <typename T>
inline double d(T t) noexcept {
    return static_cast<double>(t);
}
// Converts a clock duration to fractional seconds as double.
inline double d(Clock::duration duration) noexcept {
    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
}
1302 
1303 // Calculates clock resolution once, and remembers the result
1304 inline Clock::duration clockResolution() noexcept;
1305 
1306 } // namespace detail
1307 
1308 namespace templates {
1309 
csv()1310 char const* csv() noexcept {
1311     return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1312 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1313 {{/result}})DELIM";
1314 }
1315 
htmlBoxplot()1316 char const* htmlBoxplot() noexcept {
1317     return R"DELIM(<html>
1318 
1319 <head>
1320     <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1321 </head>
1322 
1323 <body>
1324     <div id="myDiv"></div>
1325     <script>
1326         var data = [
1327             {{#result}}{
1328                 name: '{{name}}',
1329                 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1330             },
1331             {{/result}}
1332         ];
1333         var title = '{{title}}';
1334 
1335         data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1336         var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1337     </script>
1338 </body>
1339 
1340 </html>)DELIM";
1341 }
1342 
// Mustache template producing JSON in the pyperf format, so results can be fed
// into pyperf's statistics/comparison tooling.
char const* pyperf() noexcept {
    return R"DELIM({
    "benchmarks": [
        {
            "runs": [
                {
                    "values": [
{{#measurement}}                        {{elapsed}}{{^-last}},
{{/last}}{{/measurement}}
                    ]
                }
            ]
        }
    ],
    "metadata": {
        "loops": {{sum(iterations)}},
        "inner_loops": {{batch}},
        "name": "{{title}}",
        "unit": "second"
    },
    "version": "1.0"
})DELIM";
}
1366 
// Mustache template producing a full JSON dump: configuration, medians, and
// every individual measurement for each benchmark result.
char const* json() noexcept {
    return R"DELIM({
    "results": [
{{#result}}        {
            "title": "{{title}}",
            "name": "{{name}}",
            "unit": "{{unit}}",
            "batch": {{batch}},
            "complexityN": {{complexityN}},
            "epochs": {{epochs}},
            "clockResolution": {{clockResolution}},
            "clockResolutionMultiple": {{clockResolutionMultiple}},
            "maxEpochTime": {{maxEpochTime}},
            "minEpochTime": {{minEpochTime}},
            "minEpochIterations": {{minEpochIterations}},
            "epochIterations": {{epochIterations}},
            "warmup": {{warmup}},
            "relative": {{relative}},
            "median(elapsed)": {{median(elapsed)}},
            "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
            "median(instructions)": {{median(instructions)}},
            "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
            "median(cpucycles)": {{median(cpucycles)}},
            "median(contextswitches)": {{median(contextswitches)}},
            "median(pagefaults)": {{median(pagefaults)}},
            "median(branchinstructions)": {{median(branchinstructions)}},
            "median(branchmisses)": {{median(branchmisses)}},
            "totalTime": {{sumProduct(iterations, elapsed)}},
            "measurements": [
{{#measurement}}                {
                    "iterations": {{iterations}},
                    "elapsed": {{elapsed}},
                    "pagefaults": {{pagefaults}},
                    "cpucycles": {{cpucycles}},
                    "contextswitches": {{contextswitches}},
                    "instructions": {{instructions}},
                    "branchinstructions": {{branchinstructions}},
                    "branchmisses": {{branchmisses}}
                }{{^-last}},{{/-last}}
{{/measurement}}            ]
        }{{^-last}},{{/-last}}
{{/result}}    ]
})DELIM";
}
1411 
1412 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// One node of the parsed mustache template: either raw content, a substitution
// tag, or a (possibly inverted) section with child nodes. `begin`/`end` point
// into the original template string (not owned).
struct Node {
    enum class Type { tag, content, section, inverted_section };

    char const* begin;
    char const* end;
    std::vector<Node> children;
    Type type;

    // True iff [begin, end) spells exactly the string literal `str`
    // (N includes the terminating '\0', so the payload length is N - 1).
    template <size_t N>
    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
    bool operator==(char const (&str)[N]) const noexcept {
        auto const len = static_cast<size_t>(end - begin);
        return (len + 1 == N) && (0 == std::strncmp(begin, str, len));
    }
};
1427 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1428 
// Recursively parses a mustache-like template into a tree of Nodes. `*tpl` is
// advanced past the parsed region; recursion ends at end-of-string or at a
// closing "{{/...}}" tag (the tag's name is not checked against its opener).
static std::vector<Node> parseMustacheTemplate(char const** tpl) {
    std::vector<Node> nodes;

    while (true) {
        // locate the next "{{ ... }}" tag, if any
        auto begin = std::strstr(*tpl, "{{");
        auto end = begin;
        if (begin != nullptr) {
            begin += 2;
            end = std::strstr(begin, "}}");
        }

        if (begin == nullptr || end == nullptr) {
            // no further tag: the remainder is plain content, finish node
            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
            return nodes;
        }

        // everything before the tag is plain content
        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});

        // we found a tag
        *tpl = end + 2;
        switch (*begin) {
        case '/':
            // closing tag: this (sub)section is finished! bail out
            return nodes;

        case '#':
            // "{{#name}}": section — parse children recursively until a closing tag
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
            break;

        case '^':
            // "{{^name}}": inverted section — same parsing, inverted rendering
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
            break;

        default:
            // "{{name}}": plain substitution tag
            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
            break;
        }
    }
}
1469 
1470 static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1471     ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1472     bool matchFirst = n == "-first";
1473     bool matchLast = n == "-last";
1474     if (!matchFirst && !matchLast) {
1475         return false;
1476     }
1477 
1478     bool doWrite = false;
1479     if (n.type == Node::Type::section) {
1480         doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1481     } else if (n.type == Node::Type::inverted_section) {
1482         doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1483     }
1484 
1485     if (doWrite) {
1486         for (auto const& child : n.children) {
1487             if (child.type == Node::Type::content) {
1488                 out.write(child.begin, std::distance(child.begin, child.end));
1489             }
1490         }
1491     }
1492     return true;
1493 }
1494 
// Parses a command of the form "name(arg1, arg2, ...)" without std::regex
// (g++ 4.8 compatibility). On success, matchResult holds the command name
// followed by one entry per comma-separated argument (whitespace stripped),
// and true is returned. Returns false when no closing ')' is found.
static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
    matchResult.clear();
    auto const posOpen = str.find('(');
    auto const posClose = str.find(')', posOpen);
    if (posClose == std::string::npos) {
        return false;
    }

    // command name: everything before the '('
    matchResult.emplace_back(str.substr(0, posOpen));

    // arguments: split at ',' while dropping spaces and tabs
    matchResult.emplace_back();
    for (auto i = posOpen + 1; i != posClose; ++i) {
        auto const ch = str[i];
        if (ch == ',') {
            // comma starts the next argument
            matchResult.emplace_back();
        } else if (ch != ' ' && ch != '\t') {
            matchResult.back() += ch;
        }
    }
    return true;
}
1522 
1523 static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1524     using detail::d;
1525 
1526     if (n == "title") {
1527         out << config.mBenchmarkTitle;
1528         return true;
1529     } else if (n == "name") {
1530         out << config.mBenchmarkName;
1531         return true;
1532     } else if (n == "unit") {
1533         out << config.mUnit;
1534         return true;
1535     } else if (n == "batch") {
1536         out << config.mBatch;
1537         return true;
1538     } else if (n == "complexityN") {
1539         out << config.mComplexityN;
1540         return true;
1541     } else if (n == "epochs") {
1542         out << config.mNumEpochs;
1543         return true;
1544     } else if (n == "clockResolution") {
1545         out << d(detail::clockResolution());
1546         return true;
1547     } else if (n == "clockResolutionMultiple") {
1548         out << config.mClockResolutionMultiple;
1549         return true;
1550     } else if (n == "maxEpochTime") {
1551         out << d(config.mMaxEpochTime);
1552         return true;
1553     } else if (n == "minEpochTime") {
1554         out << d(config.mMinEpochTime);
1555         return true;
1556     } else if (n == "minEpochIterations") {
1557         out << config.mMinEpochIterations;
1558         return true;
1559     } else if (n == "epochIterations") {
1560         out << config.mEpochIterations;
1561         return true;
1562     } else if (n == "warmup") {
1563         out << config.mWarmup;
1564         return true;
1565     } else if (n == "relative") {
1566         out << config.mIsRelative;
1567         return true;
1568     }
1569     return false;
1570 }
1571 
1572 static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1573     if (generateConfigTag(n, r.config(), out)) {
1574         return out;
1575     }
1576     // match e.g. "median(elapsed)"
1577     // g++ 4.8 doesn't implement std::regex :(
1578     // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
1579     // std::cmatch matchResult;
1580     // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1581     std::vector<std::string> matchResult;
1582     if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1583         if (matchResult.size() == 2) {
1584             auto m = Result::fromString(matchResult[1]);
1585             if (m == Result::Measure::_size) {
1586                 return out << 0.0;
1587             }
1588 
1589             if (matchResult[0] == "median") {
1590                 return out << r.median(m);
1591             }
1592             if (matchResult[0] == "average") {
1593                 return out << r.average(m);
1594             }
1595             if (matchResult[0] == "medianAbsolutePercentError") {
1596                 return out << r.medianAbsolutePercentError(m);
1597             }
1598             if (matchResult[0] == "sum") {
1599                 return out << r.sum(m);
1600             }
1601             if (matchResult[0] == "minimum") {
1602                 return out << r.minimum(m);
1603             }
1604             if (matchResult[0] == "maximum") {
1605                 return out << r.maximum(m);
1606             }
1607         } else if (matchResult.size() == 3) {
1608             auto m1 = Result::fromString(matchResult[1]);
1609             auto m2 = Result::fromString(matchResult[2]);
1610             if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1611                 return out << 0.0;
1612             }
1613 
1614             if (matchResult[0] == "sumProduct") {
1615                 return out << r.sumProduct(m1, m2);
1616             }
1617         }
1618     }
1619 
1620     // match e.g. "sumProduct(elapsed, iterations)"
1621     // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");
1622 
1623     // nothing matches :(
1624     throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1625 }
1626 
1627 static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1628     for (auto const& n : nodes) {
1629         if (!generateFirstLast(n, idx, r.size(), out)) {
1630             ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1631             switch (n.type) {
1632             case Node::Type::content:
1633                 out.write(n.begin, std::distance(n.begin, n.end));
1634                 break;
1635 
1636             case Node::Type::inverted_section:
1637                 throw std::runtime_error("got a inverted section inside measurement");
1638 
1639             case Node::Type::section:
1640                 throw std::runtime_error("got a section inside measurement");
1641 
1642             case Node::Type::tag: {
1643                 auto m = Result::fromString(std::string(n.begin, n.end));
1644                 if (m == Result::Measure::_size || !r.has(m)) {
1645                     out << 0.0;
1646                 } else {
1647                     out << r.get(idx, m);
1648                 }
1649                 break;
1650             }
1651             }
1652         }
1653     }
1654 }
1655 
1656 static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1657     auto const& r = results[idx];
1658     for (auto const& n : nodes) {
1659         if (!generateFirstLast(n, idx, results.size(), out)) {
1660             ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1661             switch (n.type) {
1662             case Node::Type::content:
1663                 out.write(n.begin, std::distance(n.begin, n.end));
1664                 break;
1665 
1666             case Node::Type::inverted_section:
1667                 throw std::runtime_error("got a inverted section inside result");
1668 
1669             case Node::Type::section:
1670                 if (n == "measurement") {
1671                     for (size_t i = 0; i < r.size(); ++i) {
1672                         generateResultMeasurement(n.children, i, r, out);
1673                     }
1674                 } else {
1675                     throw std::runtime_error("got a section inside result");
1676                 }
1677                 break;
1678 
1679             case Node::Type::tag:
1680                 generateResultTag(n, r, out);
1681                 break;
1682             }
1683         }
1684     }
1685 }
1686 
1687 } // namespace templates
1688 
1689 // helper stuff that only intended to be used internally
1690 namespace detail {
1691 
// thin wrapper around std::getenv; returns nullptr if the variable is unset
char const* getEnv(char const* name);
// true when environment variable NANOBENCH_ENDLESS equals `name`
bool isEndlessRunning(std::string const& name);
// true unless NANOBENCH_SUPPRESS_WARNINGS is set to something other than "0"
bool isWarningsEnabled();

// reads a single whitespace-delimited value of type T from the given file (e.g. /sys entries)
template <typename T>
T parseFile(std::string const& filename);

// collects stability warnings (e.g. frequency scaling) and matching recommendations
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
// prints the gathered stability information to *os, at most once per process
void printStabilityInformationOnce(std::ostream* os);

// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept;

// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1707 
1708 // formatting utilities
1709 namespace fmt {
1710 
// adds thousands separator to numbers
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// std::numpunct facet imbued into a stream's locale to group digits.
// Member definitions live elsewhere in this file.
class NumSep : public std::numpunct<char> {
public:
    // `sep` is the character placed between digit groups
    explicit NumSep(char sep);
    char do_thousands_sep() const override;
    std::string do_grouping() const override;

private:
    char mSep; // separator character handed back by do_thousands_sep()
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1723 
// RAII to save & restore a stream's state
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Captures locale, precision, width, fill and format flags at construction;
// restores them in the destructor (or earlier via restore()).
// Member definitions live elsewhere in this file.
class StreamStateRestorer {
public:
    explicit StreamStateRestorer(std::ostream& s);
    ~StreamStateRestorer();

    // sets back all stream info that we remembered at construction
    void restore();

    // don't allow copying / moving
    StreamStateRestorer(StreamStateRestorer const&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
    StreamStateRestorer(StreamStateRestorer&&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;

private:
    std::ostream& mStream;          // the stream whose state is restored
    std::locale mLocale;            // saved locale
    std::streamsize const mPrecision;
    std::streamsize const mWidth;
    std::ostream::char_type const mFill;
    std::ostream::fmtflags const mFmtFlags;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1749 
// Number formatter
// Formats a value with a fixed width and precision for table output.
// Member definitions live elsewhere in this file.
class Number {
public:
    Number(int width, int precision, double value);
    Number(int width, int precision, int64_t value);
    // formatted representation as a string
    std::string to_s() const;

private:
    friend std::ostream& operator<<(std::ostream& os, Number const& n);
    std::ostream& write(std::ostream& os) const;

    int mWidth;       // minimum field width
    int mPrecision;   // digits of precision
    double mValue;    // the value to format (int64_t ctor presumably converts — see definition)
};
1765 
1766 // helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1767 std::string to_s(uint64_t s);
1768 
1769 std::ostream& operator<<(std::ostream& os, Number const& n);
1770 
// One column of the markdown result table: knows how to render its own
// title, separator, invalid placeholder, and value cells.
// Member definitions live elsewhere in this file.
class MarkDownColumn {
public:
    MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
    std::string title() const;     // header cell
    std::string separator() const; // "---"-style separator cell
    std::string invalid() const;   // placeholder cell used when a benchmark errored
    std::string value() const;     // formatted value cell

private:
    int mWidth;          // column width
    int mPrecision;      // numeric precision
    std::string mTitle;  // column heading
    std::string mSuffix; // unit suffix appended to the value (e.g. "%")
    double mValue;       // the value to display
};
1786 
// Formats any text as markdown code, escaping backticks.
// Member definitions live elsewhere in this file.
class MarkDownCode {
public:
    explicit MarkDownCode(std::string const& what);

private:
    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
    std::ostream& write(std::ostream& os) const;

    std::string mWhat{}; // the text to render as inline code
};
1798 
1799 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1800 
1801 } // namespace fmt
1802 } // namespace detail
1803 } // namespace nanobench
1804 } // namespace ankerl
1805 
1806 // implementation /////////////////////////////////////////////////////////////////////////////////
1807 
1808 namespace ankerl {
1809 namespace nanobench {
1810 
1811 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1812     detail::fmt::StreamStateRestorer restorer(out);
1813 
1814     out.precision(std::numeric_limits<double>::digits10);
1815     auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1816 
1817     for (auto const& n : nodes) {
1818         ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1819         switch (n.type) {
1820         case templates::Node::Type::content:
1821             out.write(n.begin, std::distance(n.begin, n.end));
1822             break;
1823 
1824         case templates::Node::Type::inverted_section:
1825             throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1826 
1827         case templates::Node::Type::section:
1828             if (n == "result") {
1829                 const size_t nbResults = results.size();
1830                 for (size_t i = 0; i < nbResults; ++i) {
1831                     generateResult(n.children, i, results, out);
1832                 }
1833             } else if (n == "measurement") {
1834                 if (results.size() != 1) {
1835                     throw std::runtime_error(
1836                         "render: can only use section 'measurement' here if there is a single result, but there are " +
1837                         detail::fmt::to_s(results.size()));
1838                 }
1839                 // when we only have a single result, we can immediately go into its measurement.
1840                 auto const& r = results.front();
1841                 for (size_t i = 0; i < r.size(); ++i) {
1842                     generateResultMeasurement(n.children, i, r, out);
1843                 }
1844             } else {
1845                 throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1846             }
1847             break;
1848 
1849         case templates::Node::Type::tag:
1850             if (results.size() == 1) {
1851                 // result & config are both supported there
1852                 generateResultTag(n, results.front(), out);
1853             } else {
1854                 // This just uses the last result's config.
1855                 if (!generateConfigTag(n, results.back().config(), out)) {
1856                     throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1857                 }
1858             }
1859             break;
1860         }
1861     }
1862 }
1863 
// Convenience overload: renders from a std::string template.
void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    render(mustacheTemplate.c_str(), results, out);
}
1867 
// Convenience overload: renders all results of a Bench.
void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate, bench.results(), out);
}
1871 
// Convenience overload: std::string template + all results of a Bench.
void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate.c_str(), bench.results(), out);
}
1875 
1876 namespace detail {
1877 
// Returns the process-wide PerformanceCounters singleton (constructed on first use).
PerformanceCounters& performanceCounters() {
#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wexit-time-destructors"
#    endif
    // function-local static: thread-safe lazy initialization, lives until process exit
    static PerformanceCounters pc;
#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
    return pc;
}
1889 
1890 // Windows version of doNotOptimizeAway
1891 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
1892 // see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
1893 // see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
#    if defined(_MSC_VER)
#        pragma optimize("", off)
// the call to this empty function acts as a sink; optimizations are disabled around
// it so MSVC can't remove the call (and thus the computation of its argument)
void doNotOptimizeAwaySink(void const*) {}
#        pragma optimize("", on)
#    endif
1899 
// Reads a single whitespace-delimited value of type T from the given file.
// Returns a value-initialized T if the file can't be opened or parsed.
template <typename T>
T parseFile(std::string const& filename) {
    T value{};
    std::ifstream in(filename);
    in >> value;
    return value;
}
1907 
// Thin wrapper around std::getenv; returns nullptr if `name` is unset.
// The pragmas silence MSVC's C4996 deprecation warning for getenv.
char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
#    endif
    return std::getenv(name);
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
}
1918 
1919 bool isEndlessRunning(std::string const& name) {
1920     auto endless = getEnv("NANOBENCH_ENDLESS");
1921     return nullptr != endless && endless == name;
1922 }
1923 
1924 // True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
1925 bool isWarningsEnabled() {
1926     auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
1927     return nullptr == suppression || suppression == std::string("0");
1928 }
1929 
// Collects human-readable warnings about conditions that make benchmark results
// unstable (DEBUG build, CPU frequency scaling, governor, turbo) plus matching
// recommendations. Both output vectors are cleared first. Hardware checks are
// Linux-only (sysfs); elsewhere only the DEBUG check applies.
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
    warnings.clear();
    recommendations.clear();

    bool recommendCheckFlags = false;

#    if defined(DEBUG)
    warnings.emplace_back("DEBUG defined");
    recommendCheckFlags = true;
#    endif

    bool recommendPyPerf = false;
#    if defined(__linux__)
    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
    if (nprocs <= 0) {
        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
    } else {

        // check frequency scaling
        for (long id = 0; id < nprocs; ++id) {
            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
            if (minFreq != maxFreq) {
                // sysfs values are divided by 1000 and reported as MHz (presumably kHz input)
                auto minMHz = static_cast<double>(minFreq) / 1000.0;
                auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
                                      " MHz");
                recommendPyPerf = true;
                break; // one scaling warning is enough
            }
        }

        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
        if ("performance" != currentGovernor) {
            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
            recommendPyPerf = true;
        }

        // no_turbo == 0 means turbo boost is active (intel_pstate interface)
        if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
            recommendPyPerf = true;
        }
    }
#    endif

    if (recommendCheckFlags) {
        recommendations.emplace_back("Make sure you compile for Release");
    }
    if (recommendPyPerf) {
        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
    }
}
1985 
1986 void printStabilityInformationOnce(std::ostream* outStream) {
1987     static bool shouldPrint = true;
1988     if (shouldPrint && outStream && isWarningsEnabled()) {
1989         auto& os = *outStream;
1990         shouldPrint = false;
1991         std::vector<std::string> warnings;
1992         std::vector<std::string> recommendations;
1993         gatherStabilityInformation(warnings, recommendations);
1994         if (warnings.empty()) {
1995             return;
1996         }
1997 
1998         os << "Warning, results might be unstable:" << std::endl;
1999         for (auto const& w : warnings) {
2000             os << "* " << w << std::endl;
2001         }
2002 
2003         os << std::endl << "Recommendations" << std::endl;
2004         for (auto const& r : recommendations) {
2005             os << "* " << r << std::endl;
2006         }
2007     }
2008 }
2009 
// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept {
    // process-wide mutable state; returned by reference so callers can update it
    static uint64_t sHeaderHash{};
    return sHeaderHash;
}
2015 
2016 ANKERL_NANOBENCH_NO_SANITIZE("integer")
// Folds `val` into `seed` (boost::hash_combine-style mixing with the golden-ratio constant).
inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
    uint64_t const goldenRatio = UINT64_C(0x9e3779b9);
    uint64_t const mixed = val + goldenRatio + (seed << 6U) + (seed >> 2U);
    return seed ^ mixed;
}
2020 
2021 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2022 Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2023     auto bestDuration = Clock::duration::max();
2024     Clock::time_point tBegin;
2025     Clock::time_point tEnd;
2026     for (size_t i = 0; i < numEvaluations; ++i) {
2027         tBegin = Clock::now();
2028         do {
2029             tEnd = Clock::now();
2030         } while (tBegin == tEnd);
2031         bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2032     }
2033     return bestDuration;
2034 }
2035 
// Calculates clock resolution once, and remembers the result
Clock::duration clockResolution() noexcept {
    // thread-safe one-time initialization via C++11 magic static; 20 evaluations
    static Clock::duration sResolution = calcClockResolution(20);
    return sResolution;
}
2041 
2042 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2043 struct IterationLogic::Impl {
2044     enum class State { warmup, upscaling_runtime, measuring, endless };
2045 
    // Determines the target runtime per epoch (clamped between min/max epoch time)
    // and picks the initial state & iteration count of the measurement state machine.
    explicit Impl(Bench const& bench)
        : mBench(bench)
        , mResult(bench.config()) {
        printStabilityInformationOnce(mBench.output());

        // determine target runtime per epoch
        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
            mTargetRuntimePerEpoch = mBench.maxEpochTime();
        }
        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
            mTargetRuntimePerEpoch = mBench.minEpochTime();
        }

        if (isEndlessRunning(mBench.name())) {
            // environment override: run this benchmark forever (e.g. for profiling)
            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            mState = State::endless;
        } else if (0 != mBench.warmup()) {
            // explicit warmup requested: run these iterations first without recording
            mNumIters = mBench.warmup();
            mState = State::warmup;
        } else if (0 != mBench.epochIterations()) {
            // exact number of iterations
            mNumIters = mBench.epochIterations();
            mState = State::measuring;
        } else {
            // start small and upscale until the target runtime per epoch is reached
            mNumIters = mBench.minEpochIterations();
            mState = State::upscaling_runtime;
        }
    }
2076 
2077     // directly calculates new iters based on elapsed&iters, and adds a 10% noise. Makes sure we don't underflow.
2078     ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2079         auto doubleElapsed = d(elapsed);
2080         auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2081         auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2082 
2083         auto doubleMinEpochIters = d(mBench.minEpochIterations());
2084         if (doubleNewIters < doubleMinEpochIters) {
2085             doubleNewIters = doubleMinEpochIters;
2086         }
2087         doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2088 
2089         // +0.5 for correct rounding when casting
2090         // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2091         return static_cast<uint64_t>(doubleNewIters + 0.5);
2092     }
2093 
    // Grows mNumIters towards the target epoch runtime: multiply by 10 while still far
    // below target, otherwise extrapolate directly. Sets mNumIters to 0 (stop) on overflow.
    ANKERL_NANOBENCH_NO_SANITIZE("integer") void upscale(std::chrono::nanoseconds elapsed) {
        if (elapsed * 10 < mTargetRuntimePerEpoch) {
            // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
            if (mNumIters * 10 < mNumIters) {
                // overflow :-(
                showResult("iterations overflow. Maybe your code got optimized away?");
                mNumIters = 0;
                return;
            }
            mNumIters *= 10;
        } else {
            mNumIters = calcBestNumIters(elapsed, mNumIters);
        }
    }
2108 
    // Feeds one completed epoch (elapsed time for the current mNumIters iterations,
    // plus performance counters) into the state machine, records the measurement when
    // appropriate, and sets mNumIters for the next epoch (0 = benchmark finished).
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
        auto oldIters = mNumIters;
#    endif

        switch (mState) {
        case State::warmup:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if elapsed is close enough, we can skip upscaling and go right to measurements
                // still, we don't add the result to the measurements.
                mState = State::measuring;
                mNumIters = calcBestNumIters(elapsed, mNumIters);
            } else {
                // not close enough: switch to upscaling
                mState = State::upscaling_runtime;
                upscale(elapsed);
            }
            break;

        case State::upscaling_runtime:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if we are close enough, add measurement and switch to always measuring
                mState = State::measuring;
                mTotalElapsed += elapsed;
                mTotalNumIters += mNumIters;
                mResult.add(elapsed, mNumIters, pc);
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            } else {
                upscale(elapsed);
            }
            break;

        case State::measuring:
            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
            // that fluctuation, or else we would bias the result
            mTotalElapsed += elapsed;
            mTotalNumIters += mNumIters;
            mResult.add(elapsed, mNumIters, pc);
            if (0 != mBench.epochIterations()) {
                // fixed iteration count configured: keep using it
                mNumIters = mBench.epochIterations();
            } else {
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            }
            break;

        case State::endless:
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            break;
        }

        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
            // we got all the results that we need, finish it
            showResult("");
            mNumIters = 0; // signals the driver loop to stop
        }

        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
                                           << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
                                           << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
                                           << ", mState=" << static_cast<int>(mState));
    }
2170 
2171     void showResult(std::string const& errorMessage) const {
2172         ANKERL_NANOBENCH_LOG(errorMessage);
2173 
2174         if (mBench.output() != nullptr) {
2175             // prepare column data ///////
2176             std::vector<fmt::MarkDownColumn> columns;
2177 
2178             auto rMedian = mResult.median(Result::Measure::elapsed);
2179 
2180             if (mBench.relative()) {
2181                 double d = 100.0;
2182                 if (!mBench.results().empty()) {
2183                     d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2184                 }
2185                 columns.emplace_back(11, 1, "relative", "%", d);
2186             }
2187 
2188             if (mBench.complexityN() > 0) {
2189                 columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2190             }
2191 
2192             columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
2193                                  rMedian / (mBench.timeUnit().count() * mBench.batch()));
2194             columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2195 
2196             double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2197             columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2198 
2199             double rInsMedian = -1.0;
2200             if (mResult.has(Result::Measure::instructions)) {
2201                 rInsMedian = mResult.median(Result::Measure::instructions);
2202                 columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2203             }
2204 
2205             double rCycMedian = -1.0;
2206             if (mResult.has(Result::Measure::cpucycles)) {
2207                 rCycMedian = mResult.median(Result::Measure::cpucycles);
2208                 columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2209             }
2210             if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2211                 columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2212             }
2213             if (mResult.has(Result::Measure::branchinstructions)) {
2214                 double rBraMedian = mResult.median(Result::Measure::branchinstructions);
2215                 columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2216                 if (mResult.has(Result::Measure::branchmisses)) {
2217                     double p = 0.0;
2218                     if (rBraMedian >= 1e-9) {
2219                         p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2220                     }
2221                     columns.emplace_back(10, 1, "miss%", "%", p);
2222                 }
2223             }
2224 
2225             columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
2226 
2227             // write everything
2228             auto& os = *mBench.output();
2229 
2230             // combine all elements that are relevant for printing the header
2231             uint64_t hash = 0;
2232             hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2233             hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2234             hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2235             hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2236             hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2237             hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2238 
2239             if (hash != singletonHeaderHash()) {
2240                 singletonHeaderHash() = hash;
2241 
2242                 // no result yet, print header
2243                 os << std::endl;
2244                 for (auto const& col : columns) {
2245                     os << col.title();
2246                 }
2247                 os << "| " << mBench.title() << std::endl;
2248 
2249                 for (auto const& col : columns) {
2250                     os << col.separator();
2251                 }
2252                 os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2253             }
2254 
2255             if (!errorMessage.empty()) {
2256                 for (auto const& col : columns) {
2257                     os << col.invalid();
2258                 }
2259                 os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2260             } else {
2261                 for (auto const& col : columns) {
2262                     os << col.value();
2263                 }
2264                 os << "| ";
2265                 auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2266                 if (showUnstable) {
2267                     os << ":wavy_dash: ";
2268                 }
2269                 os << fmt::MarkDownCode(mBench.name());
2270                 if (showUnstable) {
2271                     auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2272                     // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2273                     auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2274 
2275                     os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2276                        << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2277                 }
2278                 os << std::endl;
2279             }
2280         }
2281     }
2282 
    // True once `elapsed` has reached at least 2/3 of the per-epoch target
    // runtime; written as multiplications to stay in integer arithmetic.
    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
    }
2286 
    uint64_t mNumIters = 1;                            // iterations to run in the next measurement round
    Bench const& mBench;                               // benchmark configuration; referenced, not owned
    std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // desired measuring time per epoch (see isCloseEnoughForMeasurements)
    Result mResult;                                    // accumulates all epoch measurements
    Rng mRng{123};                                     // fixed seed -- presumably for reproducible iteration jitter; TODO confirm
    std::chrono::nanoseconds mTotalElapsed{};          // total time measured so far
    uint64_t mTotalNumIters = 0;                       // total iterations measured so far (used for the unstable-result hint)

    State mState = State::upscaling_runtime;           // start by scaling the iteration count up
2296 };
ANKERL_NANOBENCH(IGNORE_PADDED_POP)2297 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2298 
// Pimpl: all iteration-logic state lives in Impl; released in the destructor.
IterationLogic::IterationLogic(Bench const& bench) noexcept
    : mPimpl(new Impl(bench)) {}
2301 
~IterationLogic()2302 IterationLogic::~IterationLogic() {
2303     if (mPimpl) {
2304         delete mPimpl;
2305     }
2306 }
2307 
// Number of iterations to run in the upcoming measurement round.
uint64_t IterationLogic::numIters() const noexcept {
    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
    return mPimpl->mNumIters;
}

// Feeds one measurement (elapsed time plus performance counters) into the logic.
void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
    mPimpl->add(elapsed, pc);
}

// Moves the accumulated Result into `results`; the internal Result is left moved-from.
void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
    results.emplace_back(std::move(mPimpl->mResult));
}
2320 
2321 #    if ANKERL_NANOBENCH(PERF_COUNTERS)
2322 
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)2323 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Wraps the Linux perf_event_open interface: opens a group of hardware and
// software counters behind one file descriptor, reads them all with a single
// read() call, and calibrates/subtracts the measurement overhead.
class LinuxPerformanceCounters {
public:
    // Describes where a counter's value is written by updateResults(), and
    // which overhead corrections to apply to it.
    struct Target {
        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
            : targetValue(targetValue_)
            , correctMeasuringOverhead(correctMeasuringOverhead_)
            , correctLoopOverhead(correctLoopOverhead_) {}

        uint64_t* targetValue{};         // destination for the counter value
        bool correctMeasuringOverhead{}; // subtract calibrated begin/end overhead
        bool correctLoopOverhead{};      // subtract calibrated per-iteration loop overhead
    };

    ~LinuxPerformanceCounters();

    // quick operation
    inline void start() {}

    inline void stop() {}

    // Register a software / hardware event; returns false if it couldn't be opened.
    bool monitor(perf_sw_ids swId, Target target);
    bool monitor(perf_hw_id hwId, Target target);

    // True once any perf open/ioctl/read has failed; all further work is skipped.
    bool hasError() const noexcept {
        return mHasError;
    }

    // Just reading data is faster than enable & disabling.
    // we subtract data ourselves.
    inline void beginMeasure() {
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }

    inline void endMeasure() {
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
        if (mHasError) {
            return;
        }

        // read the whole counter group in one go
        auto const numBytes = sizeof(uint64_t) * mCounters.size();
        auto ret = read(mFd, mCounters.data(), numBytes);
        mHasError = ret != static_cast<ssize_t>(numBytes);
    }

    // Copies the values read in endMeasure() into the registered targets,
    // applying the configured overhead corrections.
    void updateResults(uint64_t numIters);

    // rounded integer division
    template <typename T>
    static inline T divRounded(T a, T divisor) {
        return (a + divisor / 2) / divisor;
    }

    // Determines the fixed overhead of beginMeasure()/endMeasure() around `op`
    // (minimum over 100 runs), and the per-iteration overhead of the benchmark
    // loop itself; both are stored for later correction.
    template <typename Op>
    ANKERL_NANOBENCH_NO_SANITIZE("integer")
    void calibrate(Op&& op) {
        // clear current calibration data,
        for (auto& v : mCalibratedOverhead) {
            v = UINT64_C(0);
        }

        // create new calibration data
        auto newCalibration = mCalibratedOverhead;
        for (auto& v : newCalibration) {
            v = (std::numeric_limits<uint64_t>::max)();
        }
        // keep the minimum observed value per counter slot as the overhead
        for (size_t iter = 0; iter < 100; ++iter) {
            beginMeasure();
            op();
            endMeasure();
            if (mHasError) {
                return;
            }

            for (size_t i = 0; i < newCalibration.size(); ++i) {
                auto diff = mCounters[i];
                if (newCalibration[i] > diff) {
                    newCalibration[i] = diff;
                }
            }
        }

        mCalibratedOverhead = std::move(newCalibration);

        {
            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
            // see https://godbolt.org/z/49RVQ5
            uint64_t const numIters = 100000U + (std::random_device{}() & 3);
            uint64_t n = numIters;
            uint32_t x = 1234567;
            auto fn = [&]() {
                x ^= x << 13;
                x ^= x >> 17;
                x ^= x << 5;
            };

            beginMeasure();
            while (n-- > 0) {
                fn();
            }
            endMeasure();
            detail::doNotOptimizeAway(x);
            auto measure1 = mCounters;

            n = numIters;
            beginMeasure();
            while (n-- > 0) {
                // we now run *twice* so we can easily calculate the overhead
                fn();
                fn();
            }
            endMeasure();
            detail::doNotOptimizeAway(x);
            auto measure2 = mCounters;

            for (size_t i = 0; i < mCounters.size(); ++i) {
                // factor 2 because we have two instructions per loop
                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;

                mLoopOverhead[i] = divRounded(overhead, numIters);
            }
        }
    }

private:
    bool monitor(uint32_t type, uint64_t eventid, Target target);

    // maps each perf counter id (from PERF_EVENT_IOC_ID) to its output target
    std::map<uint64_t, Target> mIdToTarget{};

    // start with minimum size of 3 for read_format
    std::vector<uint64_t> mCounters{3};           // raw values read from the fd
    std::vector<uint64_t> mCalibratedOverhead{3}; // begin/end measuring overhead per slot
    std::vector<uint64_t> mLoopOverhead{3};       // per-iteration loop overhead per slot

    uint64_t mTimeEnabledNanos = 0;
    uint64_t mTimeRunningNanos = 0;
    int mFd = -1;          // group leader fd; -1 until the first monitor() succeeds
    bool mHasError = false;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)2482 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2483 
// Closes the perf event group's file descriptor, if one was ever opened.
LinuxPerformanceCounters::~LinuxPerformanceCounters() {
    if (-1 != mFd) {
        close(mFd);
    }
}

// Registers a software event (e.g. page faults) with the counter group.
bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_SOFTWARE, swId, target);
}

// Registers a hardware event (e.g. CPU cycles) with the counter group.
bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_HARDWARE, hwId, target);
}
2497 
// overflow is ok, it's checked
ANKERL_NANOBENCH_NO_SANITIZE("integer")
void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
    // clear old data
    for (auto& id_value : mIdToTarget) {
        *id_value.second.targetValue = UINT64_C(0);
    }

    if (mHasError) {
        return;
    }

    // read_format layout (see the flags set in monitor()): [0]=nr counters,
    // [1]=time enabled, [2]=time running, then (value, id) pairs.
    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];

    for (uint64_t i = 0; i < mCounters[0]; ++i) {
        // idx is the counter's value slot; idx+1 holds its perf id
        auto idx = static_cast<size_t>(3 + i * 2 + 0);
        auto id = mCounters[idx + 1U];

        auto it = mIdToTarget.find(id);
        if (it != mIdToTarget.end()) {

            auto& tgt = it->second;
            *tgt.targetValue = mCounters[idx];
            // subtract the calibrated begin/end overhead, clamped at 0 so the
            // unsigned value can never underflow
            if (tgt.correctMeasuringOverhead) {
                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
                    *tgt.targetValue -= mCalibratedOverhead[idx];
                } else {
                    *tgt.targetValue = 0U;
                }
            }
            // subtract the per-iteration loop overhead, again clamped at 0
            if (tgt.correctLoopOverhead) {
                auto correctionVal = mLoopOverhead[idx] * numIters;
                if (*tgt.targetValue >= correctionVal) {
                    *tgt.targetValue -= correctionVal;
                } else {
                    *tgt.targetValue = 0U;
                }
            }
        }
    }
}
2540 
// Opens a single perf event and attaches it to the group led by mFd. The
// target is primed with max() so a counter that is never read stays
// recognizable. Returns false if the event couldn't be opened or identified.
bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
    *target.targetValue = (std::numeric_limits<uint64_t>::max)();
    if (mHasError) {
        return false;
    }

    auto pea = perf_event_attr();
    std::memset(&pea, 0, sizeof(perf_event_attr));
    pea.type = type;
    pea.size = sizeof(perf_event_attr);
    pea.config = eventid;
    pea.disabled = 1; // start counter as disabled
    pea.exclude_kernel = 1;
    pea.exclude_hv = 1;

    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

    const int pid = 0;                    // the current process
    const int cpu = -1;                   // all CPUs
#        if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
#        else
    const unsigned long flags = 0;
#        endif

    // group_fd == -1 on the first call creates a new group; later events join it
    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
    if (-1 == fd) {
        return false;
    }
    if (-1 == mFd) {
        // first call: set to fd, and use this from now on
        mFd = fd;
    }
    uint64_t id = 0;
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
        // couldn't get id
        return false;
    }

    // insert into map, rely on the fact that map's references are constant.
    mIdToTarget.emplace(id, target);

    // prepare readformat with the correct size (after the insert)
    auto size = 3 + 2 * mIdToTarget.size();
    mCounters.resize(size);
    mCalibratedOverhead.resize(size);
    mLoopOverhead.resize(size);

    return true;
}
2593 
// Tries to open every counter nanobench can report; each mHas flag records
// whether the corresponding counter is actually available on this system.
PerformanceCounters::PerformanceCounters()
    : mPc(new LinuxPerformanceCounters())
    , mVal()
    , mHas() {

    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    mHas.contextSwitches =
        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
    mHas.branchInstructions =
        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
    // mHas.branchMisses = false;

    mPc->start();
    // calibrate against two bare clock queries -- presumably the minimal work
    // a measurement performs; TODO confirm this matches the measuring loop
    mPc->calibrate([] {
        auto before = ankerl::nanobench::Clock::now();
        auto after = ankerl::nanobench::Clock::now();
        (void)before;
        (void)after;
    });

    if (mPc->hasError()) {
        // something failed, don't monitor anything.
        mHas = PerfCountSet<bool>{};
    }
}
2622 
~PerformanceCounters()2623 PerformanceCounters::~PerformanceCounters() {
2624     if (nullptr != mPc) {
2625         delete mPc;
2626     }
2627 }
2628 
// Thin forwarders to the Linux perf implementation.
void PerformanceCounters::beginMeasure() {
    mPc->beginMeasure();
}

void PerformanceCounters::endMeasure() {
    mPc->endMeasure();
}

void PerformanceCounters::updateResults(uint64_t numIters) {
    mPc->updateResults(numIters);
}
2640 
2641 #    else
2642 
// Fallback when perf counters are not available on this platform: all no-ops.
PerformanceCounters::PerformanceCounters() = default;
PerformanceCounters::~PerformanceCounters() = default;
void PerformanceCounters::beginMeasure() {}
void PerformanceCounters::endMeasure() {}
void PerformanceCounters::updateResults(uint64_t) {}
2648 
2649 #    endif
2650 
// Counter values gathered during the last measurement.
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
    return mVal;
}
// Which counters could actually be monitored on this system.
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
    return mHas;
}
2657 
2658 // formatting utilities
2659 namespace fmt {
2660 
// adds thousands separator to numbers
NumSep::NumSep(char sep)
    : mSep(sep) {}

// character inserted between digit groups
char NumSep::do_thousands_sep() const {
    return mSep;
}

// "\003" means: group digits in blocks of three (numpunct grouping encoding)
std::string NumSep::do_grouping() const {
    return "\003";
}
2672 
// RAII to save & restore a stream's state
StreamStateRestorer::StreamStateRestorer(std::ostream& s)
    : mStream(s)
    , mLocale(s.getloc())
    , mPrecision(s.precision())
    , mWidth(s.width())
    , mFill(s.fill())
    , mFmtFlags(s.flags()) {}

// restores automatically when the restorer goes out of scope
StreamStateRestorer::~StreamStateRestorer() {
    restore();
}

// sets back all stream info that we remembered at construction
void StreamStateRestorer::restore() {
    mStream.imbue(mLocale);
    mStream.precision(mPrecision);
    mStream.width(mWidth);
    mStream.fill(mFill);
    mStream.flags(mFmtFlags);
}
2694 
// Fixed-point formatter: `width` total characters, `precision` fraction digits.
Number::Number(int width, int precision, int64_t value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(static_cast<double>(value)) {} // integers are rendered through double

Number::Number(int width, int precision, double value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(value) {}

// Writes the value with thousands separators; the stream's locale, precision,
// width, fill and flags are restored afterwards by the RAII restorer.
std::ostream& Number::write(std::ostream& os) const {
    StreamStateRestorer restorer(os);
    // the locale reference-counts and releases the NumSep facet
    os.imbue(std::locale(os.getloc(), new NumSep(',')));
    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
    return os;
}
2711 
to_s()2712 std::string Number::to_s() const {
2713     std::stringstream ss;
2714     write(ss);
2715     return ss.str();
2716 }
2717 
// Converts an unsigned integer to its decimal string representation.
std::string to_s(uint64_t n) {
    // std::to_string covers the full uint64_t range (including 0) and replaces
    // the hand-rolled digit loop + std::reverse.
    return std::to_string(n);
}
2727 
std::ostream& operator<<(std::ostream& os, Number const& n) {
    return n.write(os);
}

// One column of the markdown result table: total width, numeric precision,
// header title, value suffix (e.g. "%"), and the value itself.
MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
    : mWidth(w)
    , mPrecision(prec)
    , mTitle(tit)
    , mSuffix(suff)
    , mValue(val) {}

// Header cell: '|' border, then the title right-aligned into the column.
std::string MarkDownColumn::title() const {
    std::stringstream ss;
    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
    return ss.str();
}
2744 
separator()2745 std::string MarkDownColumn::separator() const {
2746     std::string sep(static_cast<size_t>(mWidth), '-');
2747     sep.front() = '|';
2748     sep.back() = ':';
2749     return sep;
2750 }
2751 
invalid()2752 std::string MarkDownColumn::invalid() const {
2753     std::string sep(static_cast<size_t>(mWidth), ' ');
2754     sep.front() = '|';
2755     sep[sep.size() - 2] = '-';
2756     return sep;
2757 }
2758 
// Data cell: the number padded to the column width, followed by the suffix.
std::string MarkDownColumn::value() const {
    std::stringstream ss;
    // reserve room for the '|' border and the suffix within the fixed width
    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
    return ss.str();
}
2765 
2766 // Formats any text as markdown code, escaping backticks.
MarkDownCode(std::string const & what)2767 MarkDownCode::MarkDownCode(std::string const& what) {
2768     mWhat.reserve(what.size() + 2);
2769     mWhat.push_back('`');
2770     for (char c : what) {
2771         mWhat.push_back(c);
2772         if ('`' == c) {
2773             mWhat.push_back('`');
2774         }
2775     }
2776     mWhat.push_back('`');
2777 }
2778 
// Streams the pre-escaped code span.
std::ostream& MarkDownCode::write(std::ostream& os) const {
    return os << mWhat;
}

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
    return mdCode.write(os);
}
2786 } // namespace fmt
2787 } // namespace detail
2788 
// provide implementation here so it's only generated once
// (out-of-line defaulted special members of Config)
Config::Config() = default;
Config::~Config() = default;
Config& Config::operator=(Config const&) = default;
Config& Config::operator=(Config&&) = default;
Config::Config(Config const&) = default;
Config::Config(Config&&) noexcept = default;

// provide implementation here so it's only generated once
// (out-of-line defaulted special members of Result)
Result::~Result() = default;
Result& Result::operator=(Result const&) = default;
Result& Result::operator=(Result&&) = default;
Result::Result(Result const&) = default;
Result::Result(Result&&) noexcept = default;
2803 
namespace detail {
// Converts an enum value to its underlying integral type (a C++11-compatible
// equivalent of C++23's std::to_underlying).
template <typename T>
inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
    return static_cast<typename std::underlying_type<T>::type>(val);
}
} // namespace detail
2810 
// Result returned after a benchmark has finished. Can be used as a baseline for relative().
Result::Result(Config const& benchmarkConfig)
    : mConfig(benchmarkConfig)
    , mNameToMeasurements{detail::u(Result::Measure::_size)} {} // one data series per Measure

// Records the measurements of one epoch. Everything is stored normalized per
// iteration (divided by `iters`); counters that are unavailable are skipped.
void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
    using detail::d;
    using detail::u;

    double dIters = d(iters);
    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);

    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
    if (pc.has().pageFaults) {
        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
    }
    if (pc.has().cpuCycles) {
        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
    }
    if (pc.has().contextSwitches) {
        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
    }
    if (pc.has().instructions) {
        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
    }
    if (pc.has().branchInstructions) {
        double branchInstructions = 0.0;
        // correcting branches: remove branch introduced by the while (...) loop for each iteration.
        if (pc.val().branchInstructions > iters + 1U) {
            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
        }
        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);

        if (pc.has().branchMisses) {
            // correcting branch misses
            double branchMisses = d(pc.val().branchMisses);
            if (branchMisses > branchInstructions) {
                // can't have branch misses when there were branches...
                branchMisses = branchInstructions;
            }

            // assuming at least one missed branch for the loop
            branchMisses -= 1.0;
            if (branchMisses < 1.0) {
                branchMisses = 1.0;
            }
            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
        }
    }
}

// The configuration that produced this Result.
Config const& Result::config() const noexcept {
    return mConfig;
}
2865 
// Median of `data`. Sorts the vector in place; an empty vector yields 0.
inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());

    auto const mid = data.size() / 2U;
    if ((data.size() % 2U) == 1U) {
        // odd count: the exact middle element
        return data[mid];
    }
    // even count: mean of the two middle elements
    return (data[mid - 1U] + data[mid]) / 2U;
}
2878 
// Median of all epoch values for measure `m`.
double Result::median(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];
    return calcMedian(data);
}

// Arithmetic mean of all epoch values for measure `m`; 0 if there's no data.
double Result::average(Measure m) const {
    using detail::d;
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // sum of all values divided by the number of epochs
    return sum(m) / d(data.size());
}

double Result::medianAbsolutePercentError(Measure m) const {
    // create copy
    auto data = mNameToMeasurements[detail::u(m)];

    // calculates MdAPE which is the median of percentage error
    // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
    auto med = calcMedian(data);

    // transform the data to absolute error
    // NOTE(review): divides by the measurement x itself -- assumes x != 0
    for (auto& x : data) {
        x = (x - med) / x;
        if (x < 0) {
            x = -x;
        }
    }
    return calcMedian(data);
}

// Sum of all epoch values for measure `m`.
double Result::sum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return std::accumulate(data.begin(), data.end(), 0.0);
}
2918 
sumProduct(Measure m1,Measure m2)2919 double Result::sumProduct(Measure m1, Measure m2) const noexcept {
2920     auto const& data1 = mNameToMeasurements[detail::u(m1)];
2921     auto const& data2 = mNameToMeasurements[detail::u(m2)];
2922 
2923     if (data1.size() != data2.size()) {
2924         return 0.0;
2925     }
2926 
2927     double result = 0.0;
2928     for (size_t i = 0, s = data1.size(); i != s; ++i) {
2929         result += data1[i] * data2[i];
2930     }
2931     return result;
2932 }
2933 
// True if at least one value was recorded for measure `m`.
bool Result::has(Measure m) const noexcept {
    return !mNameToMeasurements[detail::u(m)].empty();
}

// Value of measure `m` for epoch `idx`; at() throws std::out_of_range for a bad idx.
double Result::get(size_t idx, Measure m) const {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return data.at(idx);
}

bool Result::empty() const noexcept {
    return 0U == size();
}

// Number of epochs measured (every epoch records an elapsed value).
size_t Result::size() const noexcept {
    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
    return data.size();
}

// Smallest epoch value for measure `m`; 0 if there's no data.
double Result::minimum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // here it's safe to assume that at least one element is there
    return *std::min_element(data.begin(), data.end());
}

// Largest epoch value for measure `m`; 0 if there's no data.
double Result::maximum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // here it's safe to assume that at least one element is there
    return *std::max_element(data.begin(), data.end());
}
2971 
fromString(std::string const & str)2972 Result::Measure Result::fromString(std::string const& str) {
2973     if (str == "elapsed") {
2974         return Measure::elapsed;
2975     } else if (str == "iterations") {
2976         return Measure::iterations;
2977     } else if (str == "pagefaults") {
2978         return Measure::pagefaults;
2979     } else if (str == "cpucycles") {
2980         return Measure::cpucycles;
2981     } else if (str == "contextswitches") {
2982         return Measure::contextswitches;
2983     } else if (str == "instructions") {
2984         return Measure::instructions;
2985     } else if (str == "branchinstructions") {
2986         return Measure::branchinstructions;
2987     } else if (str == "branchmisses") {
2988         return Measure::branchmisses;
2989     } else {
2990         // not found, return _size
2991         return Measure::_size;
2992     }
2993 }
2994 
// Configuration of a microbenchmark.
Bench::Bench() {
    // default output target is stdout
    mConfig.mOut = &std::cout;
}

// All state lives in mConfig/mResults, so compiler-generated special members suffice.
Bench::Bench(Bench&&) = default;
Bench& Bench::operator=(Bench&&) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;
3005 
// Configured batch size: how many "units" one evaluation processes
// (used as the divisor for per-unit columns like ins/op).
double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

// Currently configured problem size N for complexity measurements.
double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}

// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
// means it is slower than the baseline.
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}
bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}

// Enables/disables gathering and display of performance counters.
Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}
bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}
3031 
// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
// If u differs from currently set unit, the stored results will be cleared.
// Use singular (byte, not bytes).
Bench& Bench::unit(char const* u) {
    // char const* vs std::string compares content, not pointer identity
    if (u != mConfig.mUnit) {
        mResults.clear();
    }
    mConfig.mUnit = u;
    return *this;
}

// delegates to the char const* overload, which holds the clearing logic
Bench& Bench::unit(std::string const& u) {
    return unit(u.c_str());
}

std::string const& Bench::unit() const noexcept {
    return mConfig.mUnit;
}
3050 
// Sets the time unit used when displaying results (e.g. milliseconds), along
// with the name shown in the column header.
Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
    mConfig.mTimeUnit = tu;
    mConfig.mTimeUnitName = tuName;
    return *this;
}

std::string const& Bench::timeUnitName() const noexcept {
    return mConfig.mTimeUnitName;
}

std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
    return mConfig.mTimeUnit;
}
3064 
// If benchmarkTitle differs from currently set title, the stored results will be cleared.
Bench& Bench::title(const char* benchmarkTitle) {
    // char const* vs std::string compares content, not pointer identity
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}
title(std::string const & benchmarkTitle)3073 Bench& Bench::title(std::string const& benchmarkTitle) {
3074     if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3075         mResults.clear();
3076     }
3077     mConfig.mBenchmarkTitle = benchmarkTitle;
3078     return *this;
3079 }
3080 
title()3081 std::string const& Bench::title() const noexcept {
3082     return mConfig.mBenchmarkTitle;
3083 }
3084 
// Sets the name of the benchmark that is shown in the result table.
Bench& Bench::name(const char* benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

// std::string overload of name().
Bench& Bench::name(std::string const& benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

// Returns the currently configured benchmark name.
std::string const& Bench::name() const noexcept {
    return mConfig.mBenchmarkName;
}
3098 
// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
Bench& Bench::epochs(size_t numEpochs) noexcept {
    mConfig.mNumEpochs = numEpochs;
    return *this;
}
// Returns the configured number of epochs.
size_t Bench::epochs() const noexcept {
    return mConfig.mNumEpochs;
}
3107 
// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
    mConfig.mClockResolutionMultiple = multiple;
    return *this;
}
// Returns the configured clock resolution multiple.
size_t Bench::clockResolutionMultiple() const noexcept {
    return mConfig.mClockResolutionMultiple;
}
3116 
// Sets the maximum time each epoch should take. Default is 100ms.
Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMaxEpochTime = t;
    return *this;
}
// Returns the configured maximum epoch time.
std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
    return mConfig.mMaxEpochTime;
}
3125 
// Sets the minimum time each epoch should take.
// NOTE(review): the previous comment ("Sets the maximum time ... Default is 100ms")
// was copy-pasted from maxEpochTime above; this setter controls the lower bound.
Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMinEpochTime = t;
    return *this;
}
// Returns the configured minimum epoch time.
std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
    return mConfig.mMinEpochTime;
}
3134 
// Sets the minimum number of iterations each epoch should run.
// A value of 0 is clamped to 1, since an epoch must run at least one iteration.
Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
    return *this;
}
// Returns the configured minimum number of iterations per epoch.
uint64_t Bench::minEpochIterations() const noexcept {
    return mConfig.mMinEpochIterations;
}
3142 
// Sets an exact number of iterations per epoch. Unlike minEpochIterations(),
// a value of 0 is stored as-is (0 presumably means "determine automatically" —
// confirm against the measurement loop).
Bench& Bench::epochIterations(uint64_t numIters) noexcept {
    mConfig.mEpochIterations = numIters;
    return *this;
}
// Returns the configured exact number of iterations per epoch.
uint64_t Bench::epochIterations() const noexcept {
    return mConfig.mEpochIterations;
}
3150 
// Sets the number of warmup iterations to run before measurement starts.
Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
    mConfig.mWarmup = numWarmupIters;
    return *this;
}
// Returns the configured number of warmup iterations.
uint64_t Bench::warmup() const noexcept {
    return mConfig.mWarmup;
}
3158 
// Replaces the whole benchmark configuration at once.
// NOTE(review): unlike unit()/title(), this does NOT clear stored results even if
// the new config's title or unit differs — confirm this is intended.
Bench& Bench::config(Config const& benchmarkConfig) {
    mConfig = benchmarkConfig;
    return *this;
}
// Returns the current benchmark configuration.
Config const& Bench::config() const noexcept {
    return mConfig;
}
3166 
// Sets the stream where the result table is written to. May be nullptr
// (presumably disabling output — confirm with the rendering code).
Bench& Bench::output(std::ostream* outstream) noexcept {
    mConfig.mOut = outstream;
    return *this;
}

// Returns the currently configured output stream (may be nullptr).
ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
    return mConfig.mOut;
}

// Returns all results collected so far.
std::vector<Result> const& Bench::results() const noexcept {
    return mResults;
}
3179 
// Renders the collected results with the given mustache-style template into os.
// Forwards to the free render() function; fully qualified to avoid accidentally
// resolving to these member overloads.
Bench& Bench::render(char const* templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}

// std::string overload of render().
Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}
3189 
complexityBigO()3190 std::vector<BigO> Bench::complexityBigO() const {
3191     std::vector<BigO> bigOs;
3192     auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3193     bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3194         return 1.0;
3195     });
3196     bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3197         return n;
3198     });
3199     bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3200         return std::log2(n);
3201     });
3202     bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3203         return n * std::log2(n);
3204     });
3205     bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3206         return n * n;
3207     });
3208     bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3209         return n * n * n;
3210     });
3211     std::sort(bigOs.begin(), bigOs.end());
3212     return bigOs;
3213 }
3214 
Rng()3215 Rng::Rng()
3216     : mX(0)
3217     , mY(0) {
3218     std::random_device rd;
3219     std::uniform_int_distribution<uint64_t> dist;
3220     do {
3221         mX = dist(rd);
3222         mY = dist(rd);
3223     } while (mX == 0 && mY == 0);
3224 }
3225 
3226 ANKERL_NANOBENCH_NO_SANITIZE("integer")
// SplitMix64 mixer (Steele/Lea/Flood, as published by Vigna): advances `state`
// by the golden-gamma constant, then applies two xor-shift-multiply rounds plus
// a final xor-shift to produce a well-distributed 64 bit value.
uint64_t splitMix64(uint64_t& state) noexcept {
    state += UINT64_C(0x9e3779b97f4a7c15);
    uint64_t mixed = state;
    mixed ^= mixed >> 30U;
    mixed *= UINT64_C(0xbf58476d1ce4e5b9);
    mixed ^= mixed >> 27U;
    mixed *= UINT64_C(0x94d049bb133111eb);
    mixed ^= mixed >> 31U;
    return mixed;
}
3233 
// Seeded as described in romu paper (update april 2020)
// Both member initializers advance `seed` through splitMix64, so mX and mY get
// two consecutive SplitMix64 outputs (members are initialized in class
// declaration order; the list order here matches mX then mY — see the default
// constructor's init list).
Rng::Rng(uint64_t seed) noexcept
    : mX(splitMix64(seed))
    , mY(splitMix64(seed)) {
    // discard the first 10 outputs to further mix the state
    for (size_t i = 0; i < 10; ++i) {
        operator()();
    }
}
3242 
// only internally used to copy the RNG.
Rng::Rng(uint64_t x, uint64_t y) noexcept
    : mX(x)
    , mY(y) {}

// Returns an Rng with an identical internal state. Copying is explicit through
// this method (presumably the copy constructor is disabled in the class
// declaration to avoid accidental copies — confirm there).
Rng Rng::copy() const noexcept {
    return Rng{mX, mY};
}
3251 
collectRangeMeasure(std::vector<Result> const & results)3252 BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3253     BigO::RangeMeasure rangeMeasure;
3254     for (auto const& result : results) {
3255         if (result.config().mComplexityN > 0.0) {
3256             rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3257         }
3258     }
3259     return rangeMeasure;
3260 }
3261 
BigO(std::string const & bigOName,RangeMeasure const & rangeMeasure)3262 BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
3263     : mName(bigOName) {
3264 
3265     // estimate the constant factor
3266     double sumRangeMeasure = 0.0;
3267     double sumRangeRange = 0.0;
3268 
3269     for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3270         sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
3271         sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
3272     }
3273     mConstant = sumRangeMeasure / sumRangeRange;
3274 
3275     // calculate root mean square
3276     double err = 0.0;
3277     double sumMeasure = 0.0;
3278     for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3279         auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
3280         err += diff * diff;
3281 
3282         sumMeasure += rangeMeasure[i].second;
3283     }
3284 
3285     auto n = static_cast<double>(rangeMeasure.size());
3286     auto mean = sumMeasure / n;
3287     mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3288 }
3289 
// Convenience overload: delegates to the std::string constructor.
BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
    : BigO(std::string(bigOName), rangeMeasure) {}

// Display name of this complexity class, e.g. "O(n log n)".
std::string const& BigO::name() const noexcept {
    return mName;
}

// Fitted constant factor (least-squares fit through the origin).
double BigO::constant() const noexcept {
    return mConstant;
}

// Root-mean-square fit error, normalized by the mean measurement; lower is a better fit.
double BigO::normalizedRootMeanSquare() const noexcept {
    return mNormalizedRootMeanSquare;
}
3304 
// Orders by fit quality (lower normalized RMS error first); ties are broken by
// name so the ordering is deterministic.
bool BigO::operator<(BigO const& other) const noexcept {
    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
}

// Prints a single fit as "<constant> * <name>, rms=<error>".
std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
}
3312 
// Prints all fits as a markdown table (coefficient, error %, complexity class).
// The stream's formatting state is modified (scientific, setprecision) and
// restored on scope exit by StreamStateRestorer.
std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
    detail::fmt::StreamStateRestorer restorer(os);
    os << std::endl << "|   coefficient |   err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
    for (auto const& bigO : bigOs) {
        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
        os << "| " << bigO.name();
        os << std::endl;
    }
    return os;
}
3324 
3325 } // namespace nanobench
3326 } // namespace ankerl
3327 
3328 #endif // ANKERL_NANOBENCH_IMPLEMENT
3329 #endif // ANKERL_NANOBENCH_H_INCLUDED
3330