1 // __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2 // | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3 // | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4 //
5 // Microbenchmark framework for C++11/14/17/20
6 // https://github.com/martinus/nanobench
7 //
8 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9 // SPDX-License-Identifier: MIT
10 // Copyright (c) 2019-2020 Martin Ankerl <martin.ankerl@gmail.com>
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining a copy
13 // of this software and associated documentation files (the "Software"), to deal
14 // in the Software without restriction, including without limitation the rights
15 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 // copies of the Software, and to permit persons to whom the Software is
17 // furnished to do so, subject to the following conditions:
18 //
19 // The above copyright notice and this permission notice shall be included in all
20 // copies or substantial portions of the Software.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 // SOFTWARE.
29
30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
32
33 // see https://semver.org/
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 2 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 0 // backwards-compatible bug fixes
37
38 ///////////////////////////////////////////////////////////////////////////////////////////////////
39 // public facing api - as minimal as possible
40 ///////////////////////////////////////////////////////////////////////////////////////////////////
41
42 #include <chrono> // high_resolution_clock
43 #include <cstring> // memcpy
44 #include <iosfwd> // for std::ostream* custom output target in Config
45 #include <string> // all names
46 #include <vector> // holds all results
47
48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
49
50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
55
56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
58 #else
59 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
60 #endif
61
62 #if defined(__clang__)
63 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
66 #else
67 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
69 #endif
70
71 #if defined(__GNUC__)
72 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
74 #else
75 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
77 #endif
78
79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
80 # include <iostream>
81 # define ANKERL_NANOBENCH_LOG(x) \
82 do { \
83 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
84 } while (0)
85 #else
86 # define ANKERL_NANOBENCH_LOG(x) \
87 do { \
88 } while (0)
89 #endif
90
91 #if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
92 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
93 #else
94 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
95 #endif
96
97 #if defined(__clang__)
98 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
99 #else
100 # define ANKERL_NANOBENCH_NO_SANITIZE(...)
101 #endif
102
103 #if defined(_MSC_VER)
104 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
105 #else
106 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
107 #endif
108
109 // workaround missing "is_trivially_copyable" in g++ < 5.0
110 // See https://stackoverflow.com/a/31798726/48181
111 #if defined(__GNUC__) && __GNUC__ < 5
112 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
113 #else
114 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
115 #endif
116
117 // declarations ///////////////////////////////////////////////////////////////////////////////////
118
119 namespace ankerl {
120 namespace nanobench {
121
122 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
123 std::chrono::steady_clock>::type;
124 class Bench;
125 struct Config;
126 class Result;
127 class Rng;
128 class BigO;
129
130 /**
131 * @brief Renders output from a mustache-like template and benchmark results.
132 *
133 * The templating facility here is heavily inspired by [mustache - logic-less templates](https://mustache.github.io/).
134 * It adds a few more features that are necessary to get all of the captured data out of nanobench. Please read the
135 * excellent [mustache manual](https://mustache.github.io/mustache.5.html) to see what this is all about.
136 *
137 * nanobench output has two nested layers, *result* and *measurement*. Here is a hierarchy of the allowed tags:
138 *
139 * * `{{#result}}` Marks the begin of the result layer. Whatever comes after this will be instantiated as often as
140 * a benchmark result is available. Within it, you can use these tags:
141 *
142 * * `{{title}}` See Bench::title().
143 *
144 * * `{{name}}` Benchmark name, usually directly provided with Bench::run(), but can also be set with Bench::name().
145 *
146 * * `{{unit}}` Unit, e.g. `byte`. Defaults to `op`, see Bench::title().
147 *
148 * * `{{batch}}` Batch size, see Bench::batch().
149 *
150 * * `{{complexityN}}` Value used for asymptotic complexity calculation. See Bench::complexityN().
151 *
152 * * `{{epochs}}` Number of epochs, see Bench::epochs().
153 *
154 * * `{{clockResolution}}` Accuracy of the clock, i.e. what's the smallest time possible to measure with the clock.
155 * For modern systems, this can be around 20 ns. This value is automatically determined by nanobench at the first
156 * benchmark that is run, and used as a static variable throughout the application's runtime.
157 *
158 * * `{{clockResolutionMultiple}}` Configuration multiplier for `clockResolution`. See Bench::clockResolutionMultiple().
159 * This is the target runtime for each measurement (epoch). That means the more accurate your clock is, the faster
160 * will be the benchmark. Basing the measurement's runtime on the clock resolution is the main reason why nanobench is so fast.
161 *
162 * * `{{maxEpochTime}}` Configuration for a maximum time each measurement (epoch) is allowed to take. Note that at least
163 * a single iteration will be performed, even when that takes longer than maxEpochTime. See Bench::maxEpochTime().
164 *
165 * * `{{minEpochTime}}` Minimum epoch time, usually not set. See Bench::minEpochTime().
166 *
167 * * `{{minEpochIterations}}` See Bench::minEpochIterations().
168 *
169 * * `{{epochIterations}}` See Bench::epochIterations().
170 *
171 * * `{{warmup}}` Number of iterations used before measuring starts. See Bench::warmup().
172 *
173 * * `{{relative}}` True or false, depending on the setting you have used. See Bench::relative().
174 *
175 * Apart from these tags, it is also possible to use some mathematical operations on the measurement data. The operations
176 * are of the form `{{command(name)}}`. Currently `name` can be one of `elapsed`, `iterations`. If performance counters
177 * are available (currently only on current Linux systems), you also have `pagefaults`, `cpucycles`,
 * `contextswitches`, `instructions`, `branchinstructions`, and `branchmisses`. All the measures (except `iterations`) are
179 * provided for a single iteration (so `elapsed` is the time a single iteration took). The following tags are available:
180 *
181 * * `{{median(<name>)}}` Calculate median of a measurement data set, e.g. `{{median(elapsed)}}`.
182 *
183 * * `{{average(<name>)}}` Average (mean) calculation.
184 *
185 * * `{{medianAbsolutePercentError(<name>)}}` Calculates MdAPE, the Median Absolute Percentage Error. The MdAPE is an excellent
186 * metric for the variation of measurements. It is more robust to outliers than the
187 * [Mean absolute percentage error (M-APE)](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error).
188 * @f[
189 * \mathrm{MdAPE}(e) = \mathrm{med}\{| \frac{e_i - \mathrm{med}\{e\}}{e_i}| \}
190 * @f]
191 * E.g. for *elapsed*: First, @f$ \mathrm{med}\{e\} @f$ calculates the median by sorting and then taking the middle element
192 * of all *elapsed* measurements. This is used to calculate the absolute percentage
193 * error to this median for each measurement, as in @f$ | \frac{e_i - \mathrm{med}\{e\}}{e_i}| @f$. All these results
194 * are sorted, and the middle value is chosen as the median absolute percent error.
195 *
196 * This measurement is a bit hard to interpret, but it is very robust against outliers. E.g. a value of 5% means that half of the
 * measurements deviate less than 5% from the median, and the other half deviates more than 5% from the median.
198 *
199 * * `{{sum(<name>)}}` Sums of all the measurements. E.g. `{{sum(iterations)}}` will give you the total number of iterations
200 * measured in this benchmark.
201 *
202 * * `{{minimum(<name>)}}` Minimum of all measurements.
203 *
204 * * `{{maximum(<name>)}}` Maximum of all measurements.
205 *
206 * * `{{sumProduct(<first>, <second>)}}` Calculates the sum of the products of corresponding measures:
207 * @f[
208 * \mathrm{sumProduct}(a,b) = \sum_{i=1}^{n}a_i\cdot b_i
209 * @f]
210 * E.g. to calculate total runtime of the benchmark, you multiply iterations with elapsed time for each measurement, and
211 * sum these results up:
212 * `{{sumProduct(iterations, elapsed)}}`.
213 *
214 * * `{{#measurement}}` To access individual measurement results, open the begin tag for measurements.
215 *
216 * * `{{elapsed}}` Average elapsed wall clock time per iteration, in seconds.
217 *
218 * * `{{iterations}}` Number of iterations in the measurement. The number of iterations will fluctuate due
219 * to some applied randomness, to enhance accuracy.
220 *
221 * * `{{pagefaults}}` Average number of pagefaults per iteration.
222 *
223 * * `{{cpucycles}}` Average number of CPU cycles processed per iteration.
224 *
225 * * `{{contextswitches}}` Average number of context switches per iteration.
226 *
227 * * `{{instructions}}` Average number of retired instructions per iteration.
228 *
229 * * `{{branchinstructions}}` Average number of branches executed per iteration.
230 *
231 * * `{{branchmisses}}` Average number of branches that were missed per iteration.
232 *
233 * * `{{/measurement}}` Ends the measurement tag.
234 *
235 * * `{{/result}}` Marks the end of the result layer. This is the end marker for the template part that will be instantiated
236 * for each benchmark result.
237 *
238 *
239 * For the layer tags *result* and *measurement* you additionally can use these special markers:
240 *
241 * * ``{{#-first}}`` - Begin marker of a template that will be instantiated *only for the first* entry in the layer. Use is only
 * allowed between the begin and end markers of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
243 * ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-first}}``.
244 *
 * * ``{{^-first}}`` - Begin marker of a template that will be instantiated *for each except the first* entry in the layer. This
 *   is basically the inversion of ``{{#-first}}``. Use is only allowed between the begin and end markers of the layer.
247 * So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
248 *
249 * * ``{{/-first}}`` - End marker for either ``{{#-first}}`` or ``{{^-first}}``.
250 *
251 * * ``{{#-last}}`` - Begin marker of a template that will be instantiated *only for the last* entry in the layer. Use is only
 * allowed between the begin and end markers of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
253 * ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-last}}``.
254 *
 * * ``{{^-last}}`` - Begin marker of a template that will be instantiated *for each except the last* entry in the layer. This
 *   is basically the inversion of ``{{#-last}}``. Use is only allowed between the begin and end markers of the layer.
257 * So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
258 *
259 * * ``{{/-last}}`` - End marker for either ``{{#-last}}`` or ``{{^-last}}``.
260 *
261 @verbatim embed:rst
262
263 For an overview of all the possible data you can get out of nanobench, please see the tutorial at :ref:`tutorial-template-json`.
264
265 The templates that ship with nanobench are:
266
267 * :cpp:func:`templates::csv() <ankerl::nanobench::templates::csv()>`
268 * :cpp:func:`templates::json() <ankerl::nanobench::templates::json()>`
269 * :cpp:func:`templates::htmlBoxplot() <ankerl::nanobench::templates::htmlBoxplot()>`
270 * :cpp:func:`templates::pyperf() <ankerl::nanobench::templates::pyperf()>`
271
272 @endverbatim
273 *
274 * @param mustacheTemplate The template.
275 * @param bench Benchmark, containing all the results.
276 * @param out Output for the generated output.
277 */
278 void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
279 void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
280
281 /**
282 * Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but for when
283 * you only have results available.
284 *
285 * @param mustacheTemplate The template.
286 * @param results All the results to be used for rendering.
287 * @param out Output for the generated output.
288 */
289 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
290 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
291
292 // Contains mustache-like templates
293 namespace templates {
294
295 /*!
296 @brief CSV data for the benchmark results.
297
298 Generates a comma-separated values dataset. First line is the header, each following line is a summary of each benchmark run.
299
300 @verbatim embed:rst
301 See the tutorial at :ref:`tutorial-template-csv` for an example.
302 @endverbatim
303 */
304 char const* csv() noexcept;
305
306 /*!
307 @brief HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example output.
308
309 The output uses only the elapsed wall clock time, and displays each epoch as a single dot.
310 @verbatim embed:rst
311 See the tutorial at :ref:`tutorial-template-html` for an example.
312 @endverbatim
313
314 @see ankerl::nanobench::render()
315 */
316 char const* htmlBoxplot() noexcept;
317
318 /*!
@brief Output in pyperf compatible JSON format, which can be used for further analysis.
320 @verbatim embed:rst
321 See the tutorial at :ref:`tutorial-template-pyperf` for an example how to further analyze the output.
322 @endverbatim
323 */
324 char const* pyperf() noexcept;
325
326 /*!
327 @brief Template to generate JSON data.
328
329 The generated JSON data contains *all* data that has been generated. All times are as double values, in seconds. The output can get
330 quite large.
331 @verbatim embed:rst
332 See the tutorial at :ref:`tutorial-template-json` for an example.
333 @endverbatim
334 */
335 char const* json() noexcept;
336
337 } // namespace templates
338
339 namespace detail {
340
341 template <typename T>
342 struct PerfCountSet;
343
344 class IterationLogic;
345 class PerformanceCounters;
346
347 #if ANKERL_NANOBENCH(PERF_COUNTERS)
348 class LinuxPerformanceCounters;
349 #endif
350
351 } // namespace detail
352 } // namespace nanobench
353 } // namespace ankerl
354
355 // definitions ////////////////////////////////////////////////////////////////////////////////////
356
357 namespace ankerl {
358 namespace nanobench {
359 namespace detail {
360
// Bundles one value of type T per performance counter that nanobench can track.
// The members correspond to the perf-counter measures listed in the render()
// documentation (pagefaults, cpucycles, contextswitches, instructions,
// branchinstructions, branchmisses).
template <typename T>
struct PerfCountSet {
    T pageFaults{};
    T cpuCycles{};
    T contextSwitches{};
    T instructions{};
    T branchInstructions{};
    T branchMisses{};
};
370
371 } // namespace detail
372
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Aggregate of all user-configurable settings of a benchmark run. Bench exposes
// a fluent setter/getter pair for each field, and each Result keeps a copy so
// the settings that produced a measurement remain queryable (Result::config()).
struct Config {
    // actual benchmark config
    std::string mBenchmarkTitle = "benchmark"; // table header; changing it starts a new markdown table (Bench::title())
    std::string mBenchmarkName = "noname";     // shown in the table row (Bench::name())
    std::string mUnit = "op";                  // operation unit shown in the output, e.g. "byte" (Bench::unit())
    double mBatch = 1.0;                       // size of processed data per iteration, e.g. number of bytes (Bench::batch())
    double mComplexityN = -1.0;                // N for asymptotic complexity evaluation (Bench::complexityN()); negative presumably means "unset" - TODO confirm
    size_t mNumEpochs = 11;                    // number of measurements (epochs) to perform (Bench::epochs())
    size_t mClockResolutionMultiple = static_cast<size_t>(1000); // target epoch runtime = clock resolution * this (Bench::clockResolutionMultiple())
    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // upper runtime bound per epoch (Bench::maxEpochTime())
    std::chrono::nanoseconds mMinEpochTime{};  // lower runtime bound per epoch; 0 = rely purely on clock resolution (Bench::minEpochTime())
    uint64_t mMinEpochIterations{1};           // minimum number of iterations per epoch (Bench::minEpochIterations())
    uint64_t mEpochIterations{0}; // If not 0, run *exactly* this number of iterations per epoch (Bench::epochIterations()).
    uint64_t mWarmup = 0;                      // iterations performed before measuring starts (Bench::warmup())
    std::ostream* mOut = nullptr;              // output target (Bench::output()); per its docs, nullptr disables output and the
                                               // effective default is std::cout - NOTE(review): default here is nullptr, confirm who installs std::cout
    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // time unit used for the markdown output (Bench::timeUnit())
    std::string mTimeUnitName = "ns";          // display name for mTimeUnit (Bench::timeUnit())
    bool mShowPerformanceCounters = true;      // presumably toggles the perf-counter columns in the output - TODO confirm
    bool mIsRelative = false;                  // show results relative to a baseline (Bench::relative())

    // Special members are declared here and defined out of line.
    Config();
    ~Config();
    Config& operator=(Config const&);
    Config& operator=(Config&&);
    Config(Config const&);
    Config(Config&&) noexcept;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
402
// Result returned after a benchmark has finished. Can be used as a baseline for relative().
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class Result {
public:
    // One entry per measured quantity. `elapsed` and `iterations` are the basic
    // measures; the remaining entries correspond to the perf counters described
    // in the render() documentation (per those docs, currently Linux only).
    // `_size` is a sentinel: the number of measures, also returned by
    // fromString() for unknown names.
    enum class Measure : size_t {
        elapsed,
        iterations,
        pagefaults,
        cpucycles,
        contextswitches,
        instructions,
        branchinstructions,
        branchmisses,
        _size
    };

    // Creates an empty result that remembers the configuration it was produced with.
    explicit Result(Config const& benchmarkConfig);

    ~Result();
    Result& operator=(Result const&);
    Result& operator=(Result&&);
    Result(Result const&);
    Result(Result&&) noexcept;

    // adds new measurement results
    // all values are scaled by iters (except iters...)
    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);

    // Configuration that was active when this result was produced.
    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;

    // Statistics over all recorded values of a single measure. See the render()
    // documentation above for exact semantics of each statistic, e.g.
    // {{median(<name>)}} and {{medianAbsolutePercentError(<name>)}}.
    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;

    // True if any data was recorded for measure m (perf-counter measures may be absent).
    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
    // Raw value of measure m at index idx.
    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept; // number of recorded data points - TODO confirm (presumably epochs)

    // Finds string, if not found, returns _size.
    static Measure fromString(std::string const& str);

private:
    Config mConfig{};
    // One data series per Measure, indexed by the enum's numeric value
    // (Measure is size_t-based, with _size as the count).
    std::vector<std::vector<double>> mNameToMeasurements{};
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
454
/**
 * An extremely fast random generator. Currently, this implements *RomuDuoJr*, developed by Mark Overton. Source:
 * http://www.romu-random.org/
 *
 * RomuDuoJr is extremely fast and provides reasonably good randomness. Not enough for large jobs, but definitely
 * good enough for a benchmarking framework.
 *
 * * Estimated capacity: @f$ 2^{51} @f$ bytes
 * * Register pressure: 4
 * * State size: 128 bits
 *
 * This random generator is a drop-in replacement for the generators supplied by ``<random>``. It is not
 * cryptographically secure. Its intended purpose is to be very fast so that benchmarks that make use
 * of randomness are not distorted too much by the random generator.
 *
 * Rng also provides a few non-standard helpers, optimized for speed.
 */
class Rng final {
public:
    /**
     * @brief This RNG provides 64bit randomness.
     */
    using result_type = uint64_t;

    // Names parenthesized so function-like min()/max() macros (e.g. from windows.h) can't interfere.
    static constexpr uint64_t(min)();
    static constexpr uint64_t(max)();

    /**
     * As a safety precaution, we don't allow copying. Copying a PRNG would mean you would have two random generators that produce the
     * same sequence, which is generally not what one wants. Instead create a new rng with the default constructor Rng(), which is
     * automatically seeded from `std::random_device`. If you really need a copy, use copy().
     */
    Rng(Rng const&) = delete;

    /**
     * Same as Rng(Rng const&), we don't allow assignment. If you need a new Rng create one with the default constructor Rng().
     */
    Rng& operator=(Rng const&) = delete;

    // moving is ok
    Rng(Rng&&) noexcept = default;
    Rng& operator=(Rng&&) noexcept = default;
    ~Rng() noexcept = default;

    /**
     * @brief Creates a new Random generator with random seed.
     *
     * Instead of a default seed (as the random generators from the STD), this properly seeds the random generator from
     * `std::random_device`. It guarantees correct seeding. Note that seeding can be relatively slow, depending on the source of
     * randomness used. So it is best to create a Rng once and use it for all your randomness purposes.
     */
    Rng();

    /*!
    Creates a new Rng that is seeded with a specific seed. Each Rng created from the same seed will produce the same randomness
    sequence. This can be useful for deterministic behavior.

    @verbatim embed:rst
    .. note::

       The random algorithm might change between nanobench releases. Whenever a faster and/or better random
       generator becomes available, I will switch the implementation.
    @endverbatim

    As per the Romu paper, this seeds the Rng with splitMix64 algorithm and performs 10 initial rounds for further mixing up of the
    internal state.

    @param seed The 64bit seed. All values are allowed, even 0.
    */
    explicit Rng(uint64_t seed) noexcept;
    // Constructs the Rng from two 64bit values — presumably the raw internal state
    // words (mX, mY), without the mixing rounds of Rng(seed). TODO confirm against the definition.
    Rng(uint64_t x, uint64_t y) noexcept;

    /**
     * Creates a copy of the Rng, thus the copy provides exactly the same random sequence as the original.
     */
    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;

    /**
     * @brief Produces a 64bit random value. This should be very fast, thus it is marked as inline. In my benchmark, this is ~46 times
     * faster than `std::default_random_engine` for producing 64bit random values. It seems that the fastest std contender is
     * `std::mt19937_64`. Still, this RNG is 2-3 times as fast.
     *
     * @return uint64_t The next 64 bit random value.
     */
    inline uint64_t operator()() noexcept;

    /**
     * Generates a random number between 0 and range (excluding range).
     *
     * The algorithm only produces 32bit numbers, and is slightly biased. The effect is quite small unless your range is close to the
     * maximum value of an integer. It is possible to correct the bias with rejection sampling (see
     * [here](https://lemire.me/blog/2016/06/30/fast-random-shuffling/)), but this is most likely irrelevant in practice for the
     * purposes of this Rng.
     *
     * See Daniel Lemire's blog post [A fast alternative to the modulo
     * reduction](https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/)
     *
     * @param range Upper exclusive range. E.g a value of 3 will generate random numbers 0, 1, 2.
     * @return uint32_t Generated random values in range [0, range(.
     */
    inline uint32_t bounded(uint32_t range) noexcept;

    // random double in range [0, 1(
    // see http://prng.di.unimi.it/

    /**
     * Provides a random uniform double value between 0 and 1. This uses the method described in [Generating uniform doubles in the
     * unit interval](http://prng.di.unimi.it/), and is extremely fast.
     *
     * @return double Uniformly distributed double value in range [0,1(, excluding 1.
     */
    inline double uniform01() noexcept;

    /**
     * Shuffles all entries in the given container. Although this has a slight bias due to the implementation of bounded(), this is
     * preferable to `std::shuffle` because it is over 5 times faster. See Daniel Lemire's blog post [Fast random
     * shuffling](https://lemire.me/blog/2016/06/30/fast-random-shuffling/).
     *
     * @param container The whole container will be shuffled.
     */
    template <typename Container>
    void shuffle(Container& container) noexcept;

private:
    // Rotates x left by k bits.
    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;

    // The 128 bit internal state of the generator (two 64bit words).
    uint64_t mX;
    uint64_t mY;
};
586
587 /**
588 * @brief Main entry point to nanobench's benchmarking facility.
589 *
590 * It holds configuration and results from one or more benchmark runs. Usually it is used in a single line, where the object is
591 * constructed, configured, and then a benchmark is run. E.g. like this:
592 *
593 * ankerl::nanobench::Bench().unit("byte").batch(1000).run("random fluctuations", [&] {
594 * // here be the benchmark code
595 * });
596 *
597 * In that example Bench() constructs the benchmark, it is then configured with unit() and batch(), and after configuration a
598 * benchmark is executed with run(). Once run() has finished, it prints the result to `std::cout`. It would also store the results
599 * in the Bench instance, but in this case the object is immediately destroyed so it's not available any more.
600 */
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)601 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
602 class Bench {
603 public:
604 /**
605 * @brief Creates a new benchmark for configuration and running of benchmarks.
606 */
607 Bench();
608
609 Bench(Bench&& other);
610 Bench& operator=(Bench&& other);
611 Bench(Bench const& other);
612 Bench& operator=(Bench const& other);
613 ~Bench() noexcept;
614
615 /*!
616 @brief Repeatedly calls `op()` based on the configuration, and performs measurements.
617
618 This call is marked with `noinline` to prevent the compiler to optimize beyond different benchmarks. This can have quite a big
619 effect on benchmark accuracy.
620
621 @verbatim embed:rst
622 .. note::
623
624 Each call to your lambda must have a side effect that the compiler can't possibly optimize it away. E.g. add a result to an
625 externally defined number (like `x` in the above example), and finally call `doNotOptimizeAway` on the variables the compiler
626 must not remove. You can also use :cpp:func:`ankerl::nanobench::doNotOptimizeAway` directly in the lambda, but be aware that
627 this has a small overhead.
628
629 @endverbatim
630
631 @tparam Op The code to benchmark.
632 */
633 template <typename Op>
634 ANKERL_NANOBENCH(NOINLINE)
635 Bench& run(char const* benchmarkName, Op&& op);
636
637 template <typename Op>
638 ANKERL_NANOBENCH(NOINLINE)
639 Bench& run(std::string const& benchmarkName, Op&& op);
640
641 /**
642 * @brief Same as run(char const* benchmarkName, Op op), but instead uses the previously set name.
643 * @tparam Op The code to benchmark.
644 */
645 template <typename Op>
646 ANKERL_NANOBENCH(NOINLINE)
647 Bench& run(Op&& op);
648
649 /**
650 * @brief Title of the benchmark, will be shown in the table header. Changing the title will start a new markdown table.
651 *
652 * @param benchmarkTitle The title of the benchmark.
653 */
654 Bench& title(char const* benchmarkTitle);
655 Bench& title(std::string const& benchmarkTitle);
656 ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
657
658 /// Name of the benchmark, will be shown in the table row.
659 Bench& name(char const* benchmarkName);
660 Bench& name(std::string const& benchmarkName);
661 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
662
663 /**
664 * @brief Sets the batch size.
665 *
666 * E.g. number of processed byte, or some other metric for the size of the processed data in each iteration. If you benchmark
667 * hashing of a 1000 byte long string and want byte/sec as a result, you can specify 1000 as the batch size.
668 *
669 * @tparam T Any input type is internally cast to `double`.
670 * @param b batch size
671 */
672 template <typename T>
673 Bench& batch(T b) noexcept;
674 ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
675
676 /**
677 * @brief Sets the operation unit.
678 *
679 * Defaults to "op". Could be e.g. "byte" for string processing. This is used for the table header, e.g. to show `ns/byte`. Use
680 * singular (*byte*, not *bytes*). A change clears the currently collected results.
681 *
682 * @param unit The unit name.
683 */
684 Bench& unit(char const* unit);
685 Bench& unit(std::string const& unit);
686 ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
687
688 /**
689 * @brief Sets the time unit to be used for the default output.
690 *
691 * Nanobench defaults to using ns (nanoseconds) as output in the markdown. For some benchmarks this is too coarse, so it is
692 * possible to configure this. E.g. use `timeUnit(1ms, "ms")` to show `ms/op` instead of `ns/op`.
693 *
694 * @param tu Time unit to display the results in, default is 1ns.
695 * @param tuName Name for the time unit, default is "ns"
696 */
697 Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
698 ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
699 ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
700
701 /**
702 * @brief Set the output stream where the resulting markdown table will be printed to.
703 *
704 * The default is `&std::cout`. You can disable all output by setting `nullptr`.
705 *
706 * @param outstream Pointer to output stream, can be `nullptr`.
707 */
708 Bench& output(std::ostream* outstream) noexcept;
709 ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
710
711 /**
 * Modern processors have a very accurate clock, being able to measure as low as 20 nanoseconds. This is the main trick that allows nanobench to
713 * be so fast: we find out how accurate the clock is, then run the benchmark only so often that the clock's accuracy is good enough
714 * for accurate measurements.
715 *
716 * The default is to run one epoch for 1000 times the clock resolution. So for 20ns resolution and 11 epochs, this gives a total
717 * runtime of
718 *
719 * @f[
720 * 20ns * 1000 * 11 \approx 0.2ms
721 * @f]
722 *
723 * To be precise, nanobench adds a 0-20% random noise to each evaluation. This is to prevent any aliasing effects, and further
724 * improves accuracy.
725 *
726 * Total runtime will be higher though: Some initial time is needed to find out the target number of iterations for each epoch, and
727 * there is some overhead involved to start & stop timers and calculate resulting statistics and writing the output.
728 *
729 * @param multiple Target number of times of clock resolution. Usually 1000 is a good compromise between runtime and accuracy.
730 */
731 Bench& clockResolutionMultiple(size_t multiple) noexcept;
732 ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
733
734 /**
735 * @brief Controls number of epochs, the number of measurements to perform.
736 *
737 * The reported result will be the median of evaluation of each epoch. The higher you choose this, the more
738 * deterministic the result be and outliers will be more easily removed. Also the `err%` will be more accurate the higher this
739 * number is. Note that the `err%` will not necessarily decrease when number of epochs is increased. But it will be a more accurate
740 * representation of the benchmarked code's runtime stability.
741 *
742 * Choose the value wisely. In practice, 11 has been shown to be a reasonable choice between runtime performance and accuracy.
743 * This setting goes hand in hand with minEpocIterations() (or minEpochTime()). If you are more interested in *median* runtime, you
744 * might want to increase epochs(). If you are more interested in *mean* runtime, you might want to increase minEpochIterations()
745 * instead.
746 *
747 * @param numEpochs Number of epochs.
748 */
749 Bench& epochs(size_t numEpochs) noexcept;
750 ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
751
752 /**
753 * @brief Upper limit for the runtime of each epoch.
754 *
755 * As a safety precausion if the clock is not very accurate, we can set an upper limit for the maximum evaluation time per
756 * epoch. Default is 100ms. At least a single evaluation of the benchmark is performed.
757 *
758 * @see minEpochTime(), minEpochIterations()
759 *
760 * @param t Maximum target runtime for a single epoch.
761 */
762 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
763 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
764
765 /**
766 * @brief Minimum time each epoch should take.
767 *
768 * Default is zero, so we are fully relying on clockResolutionMultiple(). In most cases this is exactly what you want. If you see
769 * that the evaluation is unreliable with a high `err%`, you can increase either minEpochTime() or minEpochIterations().
770 *
771 * @see maxEpochTime(), minEpochIterations()
772 *
773 * @param t Minimum time each epoch should take.
774 */
775 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
776 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
777
778 /**
779 * @brief Sets the minimum number of iterations each epoch should take.
780 *
781 * Default is 1, and we rely on clockResolutionMultiple(). If the `err%` is high and you want a more smooth result, you might want
782 * to increase the minimum number or iterations, or increase the minEpochTime().
783 *
784 * @see minEpochTime(), maxEpochTime(), minEpochIterations()
785 *
786 * @param numIters Minimum number of iterations per epoch.
787 */
788 Bench& minEpochIterations(uint64_t numIters) noexcept;
789 ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
790
791 /**
792 * Sets exactly the number of iterations for each epoch. Ignores all other epoch limits. This forces nanobench to use exactly
793 * the given number of iterations for each epoch, not more and not less. Default is 0 (disabled).
794 *
795 * @param numIters Exact number of iterations to use. Set to 0 to disable.
796 */
797 Bench& epochIterations(uint64_t numIters) noexcept;
798 ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
799
800 /**
801 * @brief Sets a number of iterations that are initially performed without any measurements.
802 *
803 * Some benchmarks need a few evaluations to warm up caches / database / whatever access. Normally this should not be needed, since
804 * we show the median result so initial outliers will be filtered away automatically. If the warmup effect is large though, you
805 * might want to set it. Default is 0.
806 *
807 * @param numWarmupIters Number of warmup iterations.
808 */
809 Bench& warmup(uint64_t numWarmupIters) noexcept;
810 ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
811
812 /**
813 * @brief Marks the next run as the baseline.
814 *
815 * Call `relative(true)` to mark the run as the baseline. Successive runs will be compared to this run. It is calculated by
816 *
817 * @f[
818 * 100\% * \frac{baseline}{runtime}
819 * @f]
820 *
821 * * 100% means it is exactly as fast as the baseline
822 * * >100% means it is faster than the baseline. E.g. 200% means the current run is twice as fast as the baseline.
823 * * <100% means it is slower than the baseline. E.g. 50% means it is twice as slow as the baseline.
824 *
825 * See the tutorial section "Comparing Results" for example usage.
826 *
827 * @param isRelativeEnabled True to enable processing
828 */
829 Bench& relative(bool isRelativeEnabled) noexcept;
830 ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
831
832 /**
833 * @brief Enables/disables performance counters.
834 *
835 * On Linux nanobench has a powerful feature to use performance counters. This enables counting of retired instructions, count
836 * number of branches, missed branches, etc. On default this is enabled, but you can disable it if you don't need that feature.
837 *
838 * @param showPerformanceCounters True to enable, false to disable.
839 */
840 Bench& performanceCounters(bool showPerformanceCounters) noexcept;
841 ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
842
843 /**
844 * @brief Retrieves all benchmark results collected by the bench object so far.
845 *
846 * Each call to run() generates a Result that is stored within the Bench instance. This is mostly for advanced users who want to
847 * see all the nitty gritty detials.
848 *
849 * @return All results collected so far.
850 */
851 ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
852
853 /*!
854 @verbatim embed:rst
855
856 Convenience shortcut to :cpp:func:`ankerl::nanobench::doNotOptimizeAway`.
857
858 @endverbatim
859 */
860 template <typename Arg>
861 Bench& doNotOptimizeAway(Arg&& arg);
862
863 /*!
864 @verbatim embed:rst
865
866 Sets N for asymptotic complexity calculation, so it becomes possible to calculate `Big O
867 <https://en.wikipedia.org/wiki/Big_O_notation>`_ from multiple benchmark evaluations.
868
869 Use :cpp:func:`ankerl::nanobench::Bench::complexityBigO` when the evaluation has finished. See the tutorial
870 :ref:`asymptotic-complexity` for details.
871
872 @endverbatim
873
874 @tparam T Any type is cast to `double`.
875 @param b Length of N for the next benchmark run, so it is possible to calculate `bigO`.
876 */
877 template <typename T>
878 Bench& complexityN(T b) noexcept;
879 ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
880
881 /*!
882 Calculates [Big O](https://en.wikipedia.org/wiki/Big_O_notation>) of the results with all preconfigured complexity functions.
883 Currently these complexity functions are fitted into the benchmark results:
884
885 @f$ \mathcal{O}(1) @f$,
886 @f$ \mathcal{O}(n) @f$,
887 @f$ \mathcal{O}(\log{}n) @f$,
888 @f$ \mathcal{O}(n\log{}n) @f$,
889 @f$ \mathcal{O}(n^2) @f$,
890 @f$ \mathcal{O}(n^3) @f$.
891
892 If we e.g. evaluate the complexity of `std::sort`, this is the result of `std::cout << bench.complexityBigO()`:
893
894 ```
895 | coefficient | err% | complexity
896 |--------------:|-------:|------------
897 | 5.08935e-09 | 2.6% | O(n log n)
898 | 6.10608e-08 | 8.0% | O(n)
899 | 1.29307e-11 | 47.2% | O(n^2)
900 | 2.48677e-15 | 69.6% | O(n^3)
901 | 9.88133e-06 | 132.3% | O(log n)
902 | 5.98793e-05 | 162.5% | O(1)
903 ```
904
905 So in this case @f$ \mathcal{O}(n\log{}n) @f$ provides the best approximation.
906
907 @verbatim embed:rst
908 See the tutorial :ref:`asymptotic-complexity` for details.
909 @endverbatim
910 @return Evaluation results, which can be printed or otherwise inspected.
911 */
912 std::vector<BigO> complexityBigO() const;
913
914 /**
915 * @brief Calculates bigO for a custom function.
916 *
917 * E.g. to calculate the mean squared error for @f$ \mathcal{O}(\log{}\log{}n) @f$, which is not part of the default set of
918 * complexityBigO(), you can do this:
919 *
920 * ```
921 * auto logLogN = bench.complexityBigO("O(log log n)", [](double n) {
922 * return std::log2(std::log2(n));
923 * });
924 * ```
925 *
926 * The resulting mean squared error can be printed with `std::cout << logLogN`. E.g. it prints something like this:
927 *
928 * ```text
929 * 2.46985e-05 * O(log log n), rms=1.48121
930 * ```
931 *
932 * @tparam Op Type of mapping operation.
933 * @param name Name for the function, e.g. "O(log log n)"
934 * @param op Op's operator() maps a `double` with the desired complexity function, e.g. `log2(log2(n))`.
935 * @return BigO Error calculation, which is streamable to std::cout.
936 */
937 template <typename Op>
938 BigO complexityBigO(char const* name, Op op) const;
939
940 template <typename Op>
941 BigO complexityBigO(std::string const& name, Op op) const;
942
943 /*!
944 @verbatim embed:rst
945
946 Convenience shortcut to :cpp:func:`ankerl::nanobench::render`.
947
948 @endverbatim
949 */
950 Bench& render(char const* templateContent, std::ostream& os);
951 Bench& render(std::string const& templateContent, std::ostream& os);
952
953 Bench& config(Config const& benchmarkConfig);
954 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
955
956 private:
957 Config mConfig{};
958 std::vector<Result> mResults{};
959 };
960 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
961
962 /**
963 * @brief Makes sure none of the given arguments are optimized away by the compiler.
964 *
965 * @tparam Arg Type of the argument that shouldn't be optimized away.
966 * @param arg The input that we mark as being used, even though we don't do anything with it.
967 */
968 template <typename Arg>
969 void doNotOptimizeAway(Arg&& arg);
970
971 namespace detail {
972
#if defined(_MSC_VER)
// MSVC has no GCC-style inline assembly; the value escapes through an opaque sink function instead (see below).
void doNotOptimizeAwaySink(void const*);

template <typename T>
void doNotOptimizeAway(T const& val);

#else

// This assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
// this seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
template <typename T>
void doNotOptimizeAway(T const& val) {
    // empty asm that claims to read val: forces the compiler to materialize it
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : : "r,m"(val) : "memory");
}

template <typename T>
void doNotOptimizeAway(T& val) {
#    if defined(__clang__)
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+r,m"(val) : : "memory");
#    else
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+m,r"(val) : : "memory");
#    endif
}
#endif
1001
// internally used, but visible because run() is templated.
// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
class IterationLogic {
public:
    explicit IterationLogic(Bench const& config) noexcept;
    ~IterationLogic();

    // Number of iterations to time in the next measurement round; 0 means the benchmark is done (see Bench::run()).
    ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
    // Feeds one measurement (elapsed time plus performance counter state) into the logic.
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
    // Transfers the accumulated Result into `results`.
    void moveResultTo(std::vector<Result>& results) noexcept;

private:
    struct Impl; // pimpl; defined in the implementation part
    Impl* mPimpl; // owning raw pointer — presumably freed in ~IterationLogic(); see comment above for why not unique_ptr
};
ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1020
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)1021 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Collects hardware performance counter values (instructions, branches, ...) around a measurement.
// Only backed by an actual counter implementation when ANKERL_NANOBENCH(PERF_COUNTERS) is set.
class PerformanceCounters {
public:
    PerformanceCounters(PerformanceCounters const&) = delete;
    PerformanceCounters& operator=(PerformanceCounters const&) = delete;

    PerformanceCounters();
    ~PerformanceCounters();

    // bracket a measurement, then fold the raw counts into the results for `numIters` iterations
    void beginMeasure();
    void endMeasure();
    void updateResults(uint64_t numIters);

    // measured counter values
    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
    // which counters are actually available on this platform
    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;

private:
#if ANKERL_NANOBENCH(PERF_COUNTERS)
    LinuxPerformanceCounters* mPc = nullptr;
#endif
    PerfCountSet<uint64_t> mVal{};
    PerfCountSet<bool> mHas{};
};
1044 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1045
// Gets the singleton PerformanceCounters instance.
PerformanceCounters& performanceCounters();
1048
1049 } // namespace detail
1050
// Fits benchmark results against a complexity function for asymptotic complexity (Big O)
// estimation; see Bench::complexityBigO() and Bench::complexityN().
class BigO {
public:
    // pairs of (complexityN, measured value)
    using RangeMeasure = std::vector<std::pair<double, double>>;

    // Applies `op` to the first element (the N value) of each pair, e.g. to map N to log2(N).
    template <typename Op>
    static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
        for (auto& rangeMeasure : data) {
            rangeMeasure.first = op(rangeMeasure.first);
        }
        return data;
    }

    static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);

    template <typename Op>
    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}

    template <typename Op>
    BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}

    BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
    BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
    // NOTE(review): presumably orders by fit quality (normalized RMS) — confirm in the implementation part
    ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;

private:
    std::string mName{};
    double mConstant{};
    double mNormalizedRootMeanSquare{};
};
std::ostream& operator<<(std::ostream& os, BigO const& bigO);
std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1087
1088 } // namespace nanobench
1089 } // namespace ankerl
1090
1091 // implementation /////////////////////////////////////////////////////////////////////////////////
1092
1093 namespace ankerl {
1094 namespace nanobench {
1095
uint64_t(Rng::min)1096 constexpr uint64_t(Rng::min)() {
1097 return 0;
1098 }
1099
uint64_t(Rng::max)1100 constexpr uint64_t(Rng::max)() {
1101 return (std::numeric_limits<uint64_t>::max)();
1102 }
1103
ANKERL_NANOBENCH_NO_SANITIZE("integer")
uint64_t Rng::operator()() noexcept {
    // Returns the previous mX while advancing the (mX, mY) state with a
    // multiply + rotate update (the constant/rotation match the RomuDuoJr
    // generator — NOTE(review): confirm against the Rng class documentation).
    // Unsigned wrap-around is intentional, hence the no-sanitize annotation.
    auto x = mX;

    mX = UINT64_C(15241094284759029579) * mY;
    mY = rotl(mY - x, 27);

    return x;
}
1113
ANKERL_NANOBENCH_NO_SANITIZE("integer")
uint32_t Rng::bounded(uint32_t range) noexcept {
    // Multiply-shift bounding: take 32 random bits, widen-multiply by range,
    // and keep the upper 32 bits — a value in [0, range) without a modulo.
    // (Can be slightly biased when range does not divide 2^32.)
    uint64_t r32 = static_cast<uint32_t>(operator()());
    auto multiresult = r32 * range;
    return static_cast<uint32_t>(multiresult >> 32U);
}
1120
uniform01()1121 double Rng::uniform01() noexcept {
1122 auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1123 // can't use union in c++ here for type puning, it's undefined behavior.
1124 // std::memcpy is optimized anyways.
1125 double d;
1126 std::memcpy(&d, &i, sizeof(double));
1127 return d - 1.0;
1128 }
1129
// Fisher-Yates shuffle, drawing indices from bounded().
// Note that the container size is truncated to 32 bits.
template <typename Container>
void Rng::shuffle(Container& container) noexcept {
    auto size = static_cast<uint32_t>(container.size());
    for (auto i = size; i > 1U; --i) {
        using std::swap;
        auto p = bounded(i); // number in [0, i)
        swap(container[i - 1], container[p]);
    }
}
1139
rotl(uint64_t x,unsigned k)1140 constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1141 return (x << k) | (x >> (64U - k));
1142 }
1143
// The measurement loop: repeatedly asks IterationLogic how many iterations to
// time next, times a tight loop of op() calls, and feeds the elapsed time and
// performance counters back. numIters() == 0 terminates the loop.
template <typename Op>
ANKERL_NANOBENCH_NO_SANITIZE("integer")
Bench& Bench::run(Op&& op) {
    // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
    detail::IterationLogic iterationLogic(*this);
    auto& pc = detail::performanceCounters();

    while (auto n = iterationLogic.numIters()) {
        pc.beginMeasure();
        Clock::time_point before = Clock::now();
        while (n-- > 0) {
            op();
        }
        Clock::time_point after = Clock::now();
        pc.endMeasure();
        pc.updateResults(iterationLogic.numIters());
        iterationLogic.add(after - before, pc);
    }
    iterationLogic.moveResultTo(mResults);
    return *this;
}
1165
// Performs all evaluations. Sets the benchmark name, then delegates to run(op).
template <typename Op>
Bench& Bench::run(char const* benchmarkName, Op&& op) {
    name(benchmarkName);
    return run(std::forward<Op>(op));
}

template <typename Op>
Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
    name(benchmarkName);
    return run(std::forward<Op>(op));
}
1178
// Fits the custom complexity function `op` against all collected results; see the
// declaration's documentation for an example.
template <typename Op>
BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
}

template <typename Op>
BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
}
1188
// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
// Any argument is cast to double.
template <typename T>
Bench& Bench::batch(T b) noexcept {
    mConfig.mBatch = static_cast<double>(b);
    return *this;
}

// Sets the computation complexity of the next run. Any argument is cast to double.
// Used as the N value by complexityBigO().
template <typename T>
Bench& Bench::complexityN(T n) noexcept {
    mConfig.mComplexityN = static_cast<double>(n);
    return *this;
}
1203
// Convenience: makes sure none of the given arguments are optimized away by the compiler.
template <typename Arg>
Bench& Bench::doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
    return *this;
}

// Makes sure none of the given arguments are optimized away by the compiler.
template <typename Arg>
void doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
}

namespace detail {

#if defined(_MSC_VER)
// MSVC fallback: pass the address to an opaque sink function (defined in the
// implementation part) that the optimizer cannot see through.
template <typename T>
void doNotOptimizeAway(T const& val) {
    doNotOptimizeAwaySink(&val);
}

#endif

} // namespace detail
} // namespace nanobench
} // namespace ankerl
1230
1231 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1232
1233 ///////////////////////////////////////////////////////////////////////////////////////////////////
1234 // implementation part - only visible in .cpp
1235 ///////////////////////////////////////////////////////////////////////////////////////////////////
1236
1237 # include <algorithm> // sort, reverse
1238 # include <atomic> // compare_exchange_strong in loop overhead
1239 # include <cstdlib> // getenv
1240 # include <cstring> // strstr, strncmp
1241 # include <fstream> // ifstream to parse proc files
1242 # include <iomanip> // setw, setprecision
1243 # include <iostream> // cout
1244 # include <numeric> // accumulate
1245 # include <random> // random_device
1246 # include <sstream> // to_s in Number
1247 # include <stdexcept> // throw for rendering templates
1248 # include <tuple> // std::tie
1249 # if defined(__linux__)
1250 # include <unistd.h> //sysconf
1251 # endif
1252 # if ANKERL_NANOBENCH(PERF_COUNTERS)
1253 # include <map> // map
1254
1255 # include <linux/perf_event.h>
1256 # include <sys/ioctl.h>
1257 # include <sys/syscall.h>
1258 # include <unistd.h>
1259 # endif
1260
1261 // declarations ///////////////////////////////////////////////////////////////////////////////////
1262
1263 namespace ankerl {
1264 namespace nanobench {
1265
1266 // helper stuff that is only intended to be used internally
1267 namespace detail {
1268
1269 struct TableInfo;
1270
1271 // formatting utilities
1272 namespace fmt {
1273
1274 class NumSep;
1275 class StreamStateRestorer;
1276 class Number;
1277 class MarkDownColumn;
1278 class MarkDownCode;
1279
1280 } // namespace fmt
1281 } // namespace detail
1282 } // namespace nanobench
1283 } // namespace ankerl
1284
1285 // definitions ////////////////////////////////////////////////////////////////////////////////////
1286
1287 namespace ankerl {
1288 namespace nanobench {
1289
1290 uint64_t splitMix64(uint64_t& state) noexcept;
1291
1292 namespace detail {
1293
// helpers to get double values
template <typename T>
inline double d(T t) noexcept {
    return static_cast<double>(t);
}
// converts a Clock::duration to seconds, as double
inline double d(Clock::duration duration) noexcept {
    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
}

// Calculates clock resolution once, and remembers the result
inline Clock::duration clockResolution() noexcept;
1305
1306 } // namespace detail
1307
1308 namespace templates {
1309
csv()1310 char const* csv() noexcept {
1311 return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1312 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1313 {{/result}})DELIM";
1314 }
1315
htmlBoxplot()1316 char const* htmlBoxplot() noexcept {
1317 return R"DELIM(<html>
1318
1319 <head>
1320 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1321 </head>
1322
1323 <body>
1324 <div id="myDiv"></div>
1325 <script>
1326 var data = [
1327 {{#result}}{
1328 name: '{{name}}',
1329 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1330 },
1331 {{/result}}
1332 ];
1333 var title = '{{title}}';
1334
1335 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1336 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1337 </script>
1338 </body>
1339
1340 </html>)DELIM";
1341 }
1342
// pyperf-compatible JSON render template (https://github.com/psf/pyperf).
char const* pyperf() noexcept {
    return R"DELIM({
    "benchmarks": [
        {
            "runs": [
                {
                    "values": [
{{#measurement}}                        {{elapsed}}{{^-last}},
{{/last}}{{/measurement}}                    ]
                }
            ]
        }
    ],
    "metadata": {
        "loops": {{sum(iterations)}},
        "inner_loops": {{batch}},
        "name": "{{title}}",
        "unit": "second"
    },
    "version": "1.0"
})DELIM";
}
1366
// JSON render template: dumps config values plus all statistics and raw measurements per result.
char const* json() noexcept {
    return R"DELIM({
 "results": [
{{#result}}  {
   "title": "{{title}}",
   "name": "{{name}}",
   "unit": "{{unit}}",
   "batch": {{batch}},
   "complexityN": {{complexityN}},
   "epochs": {{epochs}},
   "clockResolution": {{clockResolution}},
   "clockResolutionMultiple": {{clockResolutionMultiple}},
   "maxEpochTime": {{maxEpochTime}},
   "minEpochTime": {{minEpochTime}},
   "minEpochIterations": {{minEpochIterations}},
   "epochIterations": {{epochIterations}},
   "warmup": {{warmup}},
   "relative": {{relative}},
   "median(elapsed)": {{median(elapsed)}},
   "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
   "median(instructions)": {{median(instructions)}},
   "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
   "median(cpucycles)": {{median(cpucycles)}},
   "median(contextswitches)": {{median(contextswitches)}},
   "median(pagefaults)": {{median(pagefaults)}},
   "median(branchinstructions)": {{median(branchinstructions)}},
   "median(branchmisses)": {{median(branchmisses)}},
   "totalTime": {{sumProduct(iterations, elapsed)}},
   "measurements": [
{{#measurement}}    {
     "iterations": {{iterations}},
     "elapsed": {{elapsed}},
     "pagefaults": {{pagefaults}},
     "cpucycles": {{cpucycles}},
     "contextswitches": {{contextswitches}},
     "instructions": {{instructions}},
     "branchinstructions": {{branchinstructions}},
     "branchmisses": {{branchmisses}}
    }{{^-last}},{{/-last}}
{{/measurement}}   ]
  }{{^-last}},{{/-last}}
{{/result}} ]
})DELIM";
}
1411
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// One node of the parsed mustache-like template: raw content, a {{tag}}, a
// {{#section}}, or an inverted {{^section}}. begin/end point into the original
// template string and are not owned by the node.
struct Node {
    enum class Type { tag, content, section, inverted_section };

    char const* begin;
    char const* end;
    std::vector<Node> children;
    Type type;

    // Compares the node's text [begin, end) against a string literal.
    template <size_t N>
    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
    bool operator==(char const (&str)[N]) const noexcept {
        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
    }
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1428
// Parses a mustache-like template into a tree of Nodes, advancing *tpl past the
// consumed text. Recurses for {{#section}} and {{^inverted_section}}; any
// {{/...}} tag ends the current (sub)template — note that the closing tag's
// name is not checked against the section it closes.
static std::vector<Node> parseMustacheTemplate(char const** tpl) {
    std::vector<Node> nodes;

    while (true) {
        auto begin = std::strstr(*tpl, "{{");
        auto end = begin;
        if (begin != nullptr) {
            begin += 2;
            end = std::strstr(begin, "}}");
        }

        if (begin == nullptr || end == nullptr) {
            // nothing found, finish node
            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
            return nodes;
        }

        // everything up to the "{{" is plain content
        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});

        // we found a tag
        *tpl = end + 2;
        switch (*begin) {
        case '/':
            // finished! bail out
            return nodes;

        case '#':
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
            break;

        case '^':
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
            break;

        default:
            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
            break;
        }
    }
}
1469
// Handles the special "-first" / "-last" tags: writes the node's content
// children to `out` when the current element (idx out of size) matches.
// Returns false if the node is not a -first/-last tag, so the caller can
// process it as a regular node.
static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
    ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
    bool matchFirst = n == "-first";
    bool matchLast = n == "-last";
    if (!matchFirst && !matchLast) {
        return false;
    }

    bool doWrite = false;
    if (n.type == Node::Type::section) {
        // {{#-first}} / {{#-last}}: write only for the first / last element
        doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
    } else if (n.type == Node::Type::inverted_section) {
        // {{^-first}} / {{^-last}}: write for all but the first / last element
        doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
    }

    if (doWrite) {
        for (auto const& child : n.children) {
            if (child.type == Node::Type::content) {
                out.write(child.begin, std::distance(child.begin, child.end));
            }
        }
    }
    return true;
}
1494
// Parses a command of the form "name(arg1, arg2, ...)" into matchResult:
// element 0 is the name, elements 1..n are the arguments with spaces and tabs
// stripped. Returns false (with matchResult cleared) when no ')' follows a '('.
static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
    matchResult.clear();
    auto const openPos = str.find('(');
    auto const closePos = str.find(')', openPos);
    if (closePos == std::string::npos) {
        return false;
    }

    // everything before '(' is the command name
    matchResult.emplace_back(str.substr(0, openPos));

    // split the argument list on ',', dropping whitespace as we go
    matchResult.emplace_back();
    for (auto pos = openPos + 1; pos != closePos; ++pos) {
        auto const ch = str[pos];
        if (ch == ',') {
            // comma starts the next argument
            matchResult.emplace_back();
        } else if (ch != ' ' && ch != '\t') {
            matchResult.back() += ch;
        }
    }
    return true;
}
1522
// Writes the value of a config-level tag (title, name, unit, ...) to `out`.
// Returns true if the tag was recognized, false otherwise.
static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
    using detail::d;

    if (n == "title") {
        out << config.mBenchmarkTitle;
        return true;
    } else if (n == "name") {
        out << config.mBenchmarkName;
        return true;
    } else if (n == "unit") {
        out << config.mUnit;
        return true;
    } else if (n == "batch") {
        out << config.mBatch;
        return true;
    } else if (n == "complexityN") {
        out << config.mComplexityN;
        return true;
    } else if (n == "epochs") {
        out << config.mNumEpochs;
        return true;
    } else if (n == "clockResolution") {
        out << d(detail::clockResolution());
        return true;
    } else if (n == "clockResolutionMultiple") {
        out << config.mClockResolutionMultiple;
        return true;
    } else if (n == "maxEpochTime") {
        out << d(config.mMaxEpochTime);
        return true;
    } else if (n == "minEpochTime") {
        out << d(config.mMinEpochTime);
        return true;
    } else if (n == "minEpochIterations") {
        out << config.mMinEpochIterations;
        return true;
    } else if (n == "epochIterations") {
        out << config.mEpochIterations;
        return true;
    } else if (n == "warmup") {
        out << config.mWarmup;
        return true;
    } else if (n == "relative") {
        out << config.mIsRelative;
        return true;
    }
    return false;
}
1571
// Writes the value of a result-level tag to `out`. Tries config tags first,
// then commands of the form op(measure), e.g. "median(elapsed)", and
// op(measure, measure), e.g. "sumProduct(iterations, elapsed)". Unknown
// measures render as 0.0; an unknown command throws std::runtime_error.
static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
    if (generateConfigTag(n, r.config(), out)) {
        return out;
    }
    // match e.g. "median(elapsed)"
    // g++ 4.8 doesn't implement std::regex :(
    // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
    // std::cmatch matchResult;
    // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
    std::vector<std::string> matchResult;
    if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
        if (matchResult.size() == 2) {
            auto m = Result::fromString(matchResult[1]);
            if (m == Result::Measure::_size) {
                return out << 0.0;
            }

            if (matchResult[0] == "median") {
                return out << r.median(m);
            }
            if (matchResult[0] == "average") {
                return out << r.average(m);
            }
            if (matchResult[0] == "medianAbsolutePercentError") {
                return out << r.medianAbsolutePercentError(m);
            }
            if (matchResult[0] == "sum") {
                return out << r.sum(m);
            }
            if (matchResult[0] == "minimum") {
                return out << r.minimum(m);
            }
            if (matchResult[0] == "maximum") {
                return out << r.maximum(m);
            }
        } else if (matchResult.size() == 3) {
            auto m1 = Result::fromString(matchResult[1]);
            auto m2 = Result::fromString(matchResult[2]);
            if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
                return out << 0.0;
            }

            if (matchResult[0] == "sumProduct") {
                return out << r.sumProduct(m1, m2);
            }
        }
    }

    // match e.g. "sumProduct(elapsed, iterations)"
    // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");

    // nothing matches :(
    throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
}
1626
// Renders the nodes of a {{#measurement}} section for measurement `idx` of
// result `r`. Only content and tag nodes are allowed here; nested sections throw.
static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
    for (auto const& n : nodes) {
        if (!generateFirstLast(n, idx, r.size(), out)) {
            ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
            switch (n.type) {
            case Node::Type::content:
                out.write(n.begin, std::distance(n.begin, n.end));
                break;

            case Node::Type::inverted_section:
                throw std::runtime_error("got a inverted section inside measurement");

            case Node::Type::section:
                throw std::runtime_error("got a section inside measurement");

            case Node::Type::tag: {
                // unknown or unavailable measures render as 0.0
                auto m = Result::fromString(std::string(n.begin, n.end));
                if (m == Result::Measure::_size || !r.has(m)) {
                    out << 0.0;
                } else {
                    out << r.get(idx, m);
                }
                break;
            }
            }
        }
    }
}
1655
// Renders the nodes of a {{#result}} section for result `idx` out of
// `results`. Supports nested {{#measurement}} sections and result tags.
static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
    auto const& r = results[idx];
    for (auto const& n : nodes) {
        if (!generateFirstLast(n, idx, results.size(), out)) {
            ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
            switch (n.type) {
            case Node::Type::content:
                // literal template text: copy through unchanged
                out.write(n.begin, std::distance(n.begin, n.end));
                break;

            case Node::Type::inverted_section:
                throw std::runtime_error("got a inverted section inside result");

            case Node::Type::section:
                if (n == "measurement") {
                    // render the nested section once per collected measurement
                    for (size_t i = 0; i < r.size(); ++i) {
                        generateResultMeasurement(n.children, i, r, out);
                    }
                } else {
                    throw std::runtime_error("got a section inside result");
                }
                break;

            case Node::Type::tag:
                generateResultTag(n, r, out);
                break;
            }
        }
    }
}
1686
1687 } // namespace templates
1688
1689 // helper stuff that only intended to be used internally
1690 namespace detail {
1691
// Wrapper around std::getenv; returns nullptr when the variable is not set.
char const* getEnv(char const* name);
// True when NANOBENCH_ENDLESS names the given benchmark.
bool isEndlessRunning(std::string const& name);
// True unless suppressed via the NANOBENCH_SUPPRESS_WARNINGS environment variable.
bool isWarningsEnabled();

// Reads a single whitespace-delimited value of type T from a file; returns T{} on failure.
template <typename T>
T parseFile(std::string const& filename);

void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
void printStabilityInformationOnce(std::ostream* os);

// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept;

// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1707
1708 // formatting utilities
1709 namespace fmt {
1710
// std::numpunct facet that adds a thousands separator to numbers, used to
// format values with digit grouping via a stream's locale.
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class NumSep : public std::numpunct<char> {
public:
    explicit NumSep(char sep);
    char do_thousands_sep() const override;
    std::string do_grouping() const override;

private:
    char mSep; // the separator character inserted between digit groups
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1723
// RAII helper that saves a stream's formatting state (locale, precision,
// width, fill, flags) at construction and restores it on destruction, so
// temporary formatting changes don't leak to other users of the stream.
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class StreamStateRestorer {
public:
    explicit StreamStateRestorer(std::ostream& s);
    ~StreamStateRestorer();

    // sets back all stream info that we remembered at construction
    void restore();

    // don't allow copying / moving: the class holds a reference and must
    // restore exactly once.
    StreamStateRestorer(StreamStateRestorer const&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
    StreamStateRestorer(StreamStateRestorer&&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;

private:
    std::ostream& mStream;
    // remembered state, written back by restore()
    std::locale mLocale;
    std::streamsize const mPrecision;
    std::streamsize const mWidth;
    std::ostream::char_type const mFill;
    std::ostream::fmtflags const mFmtFlags;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1749
// Number formatter: renders a value with the given width and precision,
// either into a string (to_s) or directly into a stream (operator<<).
class Number {
public:
    Number(int width, int precision, double value);
    Number(int width, int precision, int64_t value);
    std::string to_s() const; // formatted representation as std::string

private:
    friend std::ostream& operator<<(std::ostream& os, Number const& n);
    std::ostream& write(std::ostream& os) const;

    int mWidth;
    int mPrecision;
    double mValue;
};

// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
std::string to_s(uint64_t s);

std::ostream& operator<<(std::ostream& os, Number const& n);
1770
// One column of the markdown result table. Produces the header cell
// (title), the header/body separator cell, a placeholder cell for failed
// runs (invalid), and the formatted value cell.
class MarkDownColumn {
public:
    MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
    std::string title() const;
    std::string separator() const;
    std::string invalid() const;
    std::string value() const;

private:
    int mWidth;         // column width in characters
    int mPrecision;     // digits after the decimal point
    std::string mTitle; // column heading
    std::string mSuffix; // appended to the value, e.g. "%"
    double mValue;
};
1786
// Formats any text as markdown inline code, escaping backticks.
class MarkDownCode {
public:
    explicit MarkDownCode(std::string const& what);

private:
    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
    std::ostream& write(std::ostream& os) const;

    std::string mWhat{}; // the text to render as code
};

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1800
1801 } // namespace fmt
1802 } // namespace detail
1803 } // namespace nanobench
1804 } // namespace ankerl
1805
1806 // implementation /////////////////////////////////////////////////////////////////////////////////
1807
1808 namespace ankerl {
1809 namespace nanobench {
1810
// Renders a mustache-style template with the given benchmark results into
// `out`. Top level supports literal content, a {{#result}} section (rendered
// once per result), a {{#measurement}} section (only valid with exactly one
// result), and plain tags. Throws std::runtime_error on unknown sections/tags.
void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    // restore the stream's formatting state when done; we change precision below.
    detail::fmt::StreamStateRestorer restorer(out);

    out.precision(std::numeric_limits<double>::digits10);
    auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);

    for (auto const& n : nodes) {
        ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
        switch (n.type) {
        case templates::Node::Type::content:
            // literal template text: copy through unchanged
            out.write(n.begin, std::distance(n.begin, n.end));
            break;

        case templates::Node::Type::inverted_section:
            throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");

        case templates::Node::Type::section:
            if (n == "result") {
                const size_t nbResults = results.size();
                for (size_t i = 0; i < nbResults; ++i) {
                    generateResult(n.children, i, results, out);
                }
            } else if (n == "measurement") {
                if (results.size() != 1) {
                    throw std::runtime_error(
                        "render: can only use section 'measurement' here if there is a single result, but there are " +
                        detail::fmt::to_s(results.size()));
                }
                // when we only have a single result, we can immediately go into its measurement.
                auto const& r = results.front();
                for (size_t i = 0; i < r.size(); ++i) {
                    generateResultMeasurement(n.children, i, r, out);
                }
            } else {
                throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
            }
            break;

        case templates::Node::Type::tag:
            if (results.size() == 1) {
                // result & config are both supported there
                generateResultTag(n, results.front(), out);
            } else {
                // This just uses the last result's config.
                if (!generateConfigTag(n, results.back().config(), out)) {
                    throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
                }
            }
            break;
        }
    }
}
1863
// Convenience overload: std::string template.
void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    render(mustacheTemplate.c_str(), results, out);
}

// Convenience overload: renders all results collected by a Bench.
void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate, bench.results(), out);
}

// Convenience overload: std::string template plus a Bench's results.
void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate.c_str(), bench.results(), out);
}
1875
1876 namespace detail {
1877
// Returns the process-wide PerformanceCounters singleton, lazily constructed
// on first use (Meyers singleton). Clang's -Wexit-time-destructors warning is
// silenced because the static's destructor at exit is intentional here.
PerformanceCounters& performanceCounters() {
#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wexit-time-destructors"
#    endif
    static PerformanceCounters pc;
#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
    return pc;
}
1889
// Windows version of doNotOptimizeAway: an empty sink function compiled with
// optimizations disabled, so MSVC cannot eliminate the "use" of the argument.
// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
// see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
// see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
#    if defined(_MSC_VER)
#        pragma optimize("", off)
void doNotOptimizeAwaySink(void const*) {}
#        pragma optimize("", on)
#    endif
1899
// Reads a single whitespace-delimited value of type T from the given file.
// Best-effort: when the file can't be opened or parsing fails, the
// value-initialized T{} is returned instead of reporting an error.
template <typename T>
T parseFile(std::string const& filename) {
    T parsedValue{};
    std::ifstream stream(filename);
    stream >> parsedValue;
    return parsedValue;
}
1907
// Thin wrapper around std::getenv that silences MSVC's C4996 "unsafe
// function" deprecation warning. Returns nullptr when the variable is unset.
char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
#    endif
    return std::getenv(name);
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
}
1918
1919 bool isEndlessRunning(std::string const& name) {
1920 auto endless = getEnv("NANOBENCH_ENDLESS");
1921 return nullptr != endless && endless == name;
1922 }
1923
1924 // True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
1925 bool isWarningsEnabled() {
1926 auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
1927 return nullptr == suppression || suppression == std::string("0");
1928 }
1929
// Collects warnings about conditions that make benchmark results unstable
// (debug build, CPU frequency scaling, governor, turbo) plus matching
// recommendations. Both output vectors are cleared first. The frequency /
// governor / turbo checks read Linux sysfs files and are Linux-only.
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
    warnings.clear();
    recommendations.clear();

    bool recommendCheckFlags = false;

#    if defined(DEBUG)
    warnings.emplace_back("DEBUG defined");
    recommendCheckFlags = true;
#    endif

    bool recommendPyPerf = false;
#    if defined(__linux__)
    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
    if (nprocs <= 0) {
        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
    } else {

        // check frequency scaling: a CPU whose min and max scaling frequency
        // differ can change clock speed mid-benchmark. One warning suffices.
        for (long id = 0; id < nprocs; ++id) {
            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
            if (minFreq != maxFreq) {
                // sysfs reports kHz; display as MHz.
                auto minMHz = static_cast<double>(minFreq) / 1000.0;
                auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
                                      " MHz");
                recommendPyPerf = true;
                break;
            }
        }

        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
        if ("performance" != currentGovernor) {
            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
            recommendPyPerf = true;
        }

        // intel_pstate/no_turbo == 0 means turbo boost is active (file absent
        // parses as 0 too, so this also warns on non-intel_pstate systems).
        if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
            recommendPyPerf = true;
        }
    }
#    endif

    if (recommendCheckFlags) {
        recommendations.emplace_back("Make sure you compile for Release");
    }
    if (recommendPyPerf) {
        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
    }
}
1985
1986 void printStabilityInformationOnce(std::ostream* outStream) {
1987 static bool shouldPrint = true;
1988 if (shouldPrint && outStream && isWarningsEnabled()) {
1989 auto& os = *outStream;
1990 shouldPrint = false;
1991 std::vector<std::string> warnings;
1992 std::vector<std::string> recommendations;
1993 gatherStabilityInformation(warnings, recommendations);
1994 if (warnings.empty()) {
1995 return;
1996 }
1997
1998 os << "Warning, results might be unstable:" << std::endl;
1999 for (auto const& w : warnings) {
2000 os << "* " << w << std::endl;
2001 }
2002
2003 os << std::endl << "Recommendations" << std::endl;
2004 for (auto const& r : recommendations) {
2005 os << "* " << r << std::endl;
2006 }
2007 }
2008 }
2009
// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
// Returned by mutable reference so callers can update the remembered hash.
uint64_t& singletonHeaderHash() noexcept {
    static uint64_t sHeaderHash{};
    return sHeaderHash;
}
2015
2016 ANKERL_NANOBENCH_NO_SANITIZE("integer")
2017 inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2018 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2019 }
2020
2021 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2022 Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2023 auto bestDuration = Clock::duration::max();
2024 Clock::time_point tBegin;
2025 Clock::time_point tEnd;
2026 for (size_t i = 0; i < numEvaluations; ++i) {
2027 tBegin = Clock::now();
2028 do {
2029 tEnd = Clock::now();
2030 } while (tBegin == tEnd);
2031 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2032 }
2033 return bestDuration;
2034 }
2035
// Calculates clock resolution once, and remembers the result. The first call
// pays the measurement cost of calcClockResolution(20); later calls just
// return the cached value.
Clock::duration clockResolution() noexcept {
    static Clock::duration sResolution = calcClockResolution(20);
    return sResolution;
}
2041
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Implementation of the per-benchmark iteration control logic: decides how
// many iterations to run per epoch, upscales until the target runtime per
// epoch is reached, accumulates measurements into mResult, and renders the
// markdown result row when all epochs are done.
struct IterationLogic::Impl {
    enum class State { warmup, upscaling_runtime, measuring, endless };

    explicit Impl(Bench const& bench)
        : mBench(bench)
        , mResult(bench.config()) {
        printStabilityInformationOnce(mBench.output());

        // determine target runtime per epoch: a configurable multiple of the
        // clock's resolution, clamped to [minEpochTime, maxEpochTime].
        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
            mTargetRuntimePerEpoch = mBench.maxEpochTime();
        }
        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
            mTargetRuntimePerEpoch = mBench.minEpochTime();
        }

        // choose the initial state & iteration count based on configuration
        if (isEndlessRunning(mBench.name())) {
            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            mState = State::endless;
        } else if (0 != mBench.warmup()) {
            mNumIters = mBench.warmup();
            mState = State::warmup;
        } else if (0 != mBench.epochIterations()) {
            // exact number of iterations was requested: skip upscaling
            mNumIters = mBench.epochIterations();
            mState = State::measuring;
        } else {
            mNumIters = mBench.minEpochIterations();
            mState = State::upscaling_runtime;
        }
    }

    // directly calculates new iters based on elapsed&iters, and adds random noise
    // (0-20%, ~10% on average) to avoid resonance effects. Makes sure we don't underflow.
    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
        // d() converts to double (helper defined elsewhere in this file)
        auto doubleElapsed = d(elapsed);
        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);

        auto doubleMinEpochIters = d(mBench.minEpochIterations());
        if (doubleNewIters < doubleMinEpochIters) {
            doubleNewIters = doubleMinEpochIters;
        }
        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();

        // +0.5 for correct rounding when casting
        // NOLINTNEXTLINE(bugprone-incorrect-roundings)
        return static_cast<uint64_t>(doubleNewIters + 0.5);
    }

    // Increases mNumIters towards the target runtime: multiply by 10 while far
    // below the target, otherwise jump directly to the calculated best count.
    // mNumIters is set to 0 on overflow, which stops the benchmark.
    ANKERL_NANOBENCH_NO_SANITIZE("integer") void upscale(std::chrono::nanoseconds elapsed) {
        if (elapsed * 10 < mTargetRuntimePerEpoch) {
            // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
            if (mNumIters * 10 < mNumIters) {
                // overflow :-(
                showResult("iterations overflow. Maybe your code got optimized away?");
                mNumIters = 0;
                return;
            }
            mNumIters *= 10;
        } else {
            mNumIters = calcBestNumIters(elapsed, mNumIters);
        }
    }

    // Processes one epoch's timing: advances the state machine and computes
    // the iteration count for the next epoch. Sets mNumIters to 0 when done.
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
        auto oldIters = mNumIters;
#    endif

        switch (mState) {
        case State::warmup:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if elapsed is close enough, we can skip upscaling and go right to measurements
                // still, we don't add the result to the measurements.
                mState = State::measuring;
                mNumIters = calcBestNumIters(elapsed, mNumIters);
            } else {
                // not close enough: switch to upscaling
                mState = State::upscaling_runtime;
                upscale(elapsed);
            }
            break;

        case State::upscaling_runtime:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if we are close enough, add measurement and switch to always measuring
                mState = State::measuring;
                mTotalElapsed += elapsed;
                mTotalNumIters += mNumIters;
                mResult.add(elapsed, mNumIters, pc);
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            } else {
                upscale(elapsed);
            }
            break;

        case State::measuring:
            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
            // that fluctuation, or else we would bias the result
            mTotalElapsed += elapsed;
            mTotalNumIters += mNumIters;
            mResult.add(elapsed, mNumIters, pc);
            if (0 != mBench.epochIterations()) {
                mNumIters = mBench.epochIterations();
            } else {
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            }
            break;

        case State::endless:
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            break;
        }

        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
            // we got all the results that we need, finish it
            showResult("");
            mNumIters = 0;
        }

        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
                                           << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
                                           << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
                                           << ", mState=" << static_cast<int>(mState));
    }

    // Renders one markdown table row for this benchmark (or an "invalid" row
    // when errorMessage is non-empty). Writes a new table header whenever the
    // header-relevant settings differ from the previously printed table.
    void showResult(std::string const& errorMessage) const {
        ANKERL_NANOBENCH_LOG(errorMessage);

        if (mBench.output() != nullptr) {
            // prepare column data ///////
            std::vector<fmt::MarkDownColumn> columns;

            auto rMedian = mResult.median(Result::Measure::elapsed);

            if (mBench.relative()) {
                // relative speed compared to the first result of this Bench;
                // 100% when this is the first result.
                double d = 100.0;
                if (!mBench.results().empty()) {
                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
                }
                columns.emplace_back(11, 1, "relative", "%", d);
            }

            if (mBench.complexityN() > 0) {
                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
            }

            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));
            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);

            double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);

            // performance-counter columns are only added when the counter was
            // actually available on this run.
            double rInsMedian = -1.0;
            if (mResult.has(Result::Measure::instructions)) {
                rInsMedian = mResult.median(Result::Measure::instructions);
                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
            }

            double rCycMedian = -1.0;
            if (mResult.has(Result::Measure::cpucycles)) {
                rCycMedian = mResult.median(Result::Measure::cpucycles);
                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
            }
            if (rInsMedian > 0.0 && rCycMedian > 0.0) {
                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
            }
            if (mResult.has(Result::Measure::branchinstructions)) {
                double rBraMedian = mResult.median(Result::Measure::branchinstructions);
                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
                if (mResult.has(Result::Measure::branchmisses)) {
                    double p = 0.0;
                    if (rBraMedian >= 1e-9) {
                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
                    }
                    columns.emplace_back(10, 1, "miss%", "%", p);
                }
            }

            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));

            // write everything
            auto& os = *mBench.output();

            // combine all elements that are relevant for printing the header
            uint64_t hash = 0;
            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);

            if (hash != singletonHeaderHash()) {
                singletonHeaderHash() = hash;

                // no result yet, print header
                os << std::endl;
                for (auto const& col : columns) {
                    os << col.title();
                }
                os << "| " << mBench.title() << std::endl;

                for (auto const& col : columns) {
                    os << col.separator();
                }
                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
            }

            if (!errorMessage.empty()) {
                // benchmark failed: print placeholder cells plus the error
                for (auto const& col : columns) {
                    os << col.invalid();
                }
                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
            } else {
                for (auto const& col : columns) {
                    os << col.value();
                }
                os << "| ";
                // flag results with >= 5% median absolute percent error as unstable
                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
                if (showUnstable) {
                    os << ":wavy_dash: ";
                }
                os << fmt::MarkDownCode(mBench.name());
                if (showUnstable) {
                    auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
                    // NOLINTNEXTLINE(bugprone-incorrect-roundings)
                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);

                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
                }
                os << std::endl;
            }
        }
    }

    // True when elapsed has reached at least 2/3 of the target epoch runtime,
    // i.e. close enough that the epoch can be used as a measurement.
    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
    }

    uint64_t mNumIters = 1; // iterations for the next epoch; 0 signals "done"
    Bench const& mBench;
    std::chrono::nanoseconds mTargetRuntimePerEpoch{};
    Result mResult;
    Rng mRng{123}; // fixed seed: noise in calcBestNumIters stays reproducible
    std::chrono::nanoseconds mTotalElapsed{};
    uint64_t mTotalNumIters = 0;

    State mState = State::upscaling_runtime;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)2297 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2298
// Constructs the pimpl; ownership is released in ~IterationLogic.
IterationLogic::IterationLogic(Bench const& bench) noexcept
    : mPimpl(new Impl(bench)) {}
2301
~IterationLogic()2302 IterationLogic::~IterationLogic() {
2303 if (mPimpl) {
2304 delete mPimpl;
2305 }
2306 }
2307
// Number of iterations to run in the next epoch; 0 means the benchmark is done.
uint64_t IterationLogic::numIters() const noexcept {
    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
    return mPimpl->mNumIters;
}
2312
// Forwards one epoch's measured time and performance counters to the implementation.
void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
    mPimpl->add(elapsed, pc);
}
2316
// Moves the collected Result into `results`. The internal result is left in a
// moved-from state, so this must be the last use of this IterationLogic's data.
void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
    results.emplace_back(std::move(mPimpl->mResult));
}
2320
2321 # if ANKERL_NANOBENCH(PERF_COUNTERS)
2322
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Collects hardware/software performance counters on Linux via
// perf_event_open. All monitored events are opened into a single event group
// (leader fd mFd) and read in one syscall per measurement.
class LinuxPerformanceCounters {
public:
    // Describes where a counter's value should be written, and which
    // overhead corrections to apply in updateResults().
    struct Target {
        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
            : targetValue(targetValue_)
            , correctMeasuringOverhead(correctMeasuringOverhead_)
            , correctLoopOverhead(correctLoopOverhead_) {}

        uint64_t* targetValue{};           // where the measured value is stored
        bool correctMeasuringOverhead{};   // subtract begin/endMeasure overhead
        bool correctLoopOverhead{};        // subtract per-iteration loop overhead
    };

    ~LinuxPerformanceCounters();

    // quick operation
    inline void start() {}

    inline void stop() {}

    bool monitor(perf_sw_ids swId, Target target);
    bool monitor(perf_hw_id hwId, Target target);

    bool hasError() const noexcept {
        return mHasError;
    }

    // Just reading data is faster than enable & disabling.
    // we subtract data ourselves.
    inline void beginMeasure() {
        if (mHasError) {
            return;
        }

        // reset and enable the whole event group through the leader fd
        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }

    inline void endMeasure() {
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise)
        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
        if (mHasError) {
            return;
        }

        // one read fetches all counters of the group (PERF_FORMAT_GROUP)
        auto const numBytes = sizeof(uint64_t) * mCounters.size();
        auto ret = read(mFd, mCounters.data(), numBytes);
        mHasError = ret != static_cast<ssize_t>(numBytes);
    }

    void updateResults(uint64_t numIters);

    // rounded integer division
    template <typename T>
    static inline T divRounded(T a, T divisor) {
        return (a + divisor / 2) / divisor;
    }

    // Determines the measuring overhead (cost of beginMeasure/endMeasure) and
    // the per-iteration loop overhead, so they can be subtracted later.
    template <typename Op>
    ANKERL_NANOBENCH_NO_SANITIZE("integer")
    void calibrate(Op&& op) {
        // clear current calibration data,
        for (auto& v : mCalibratedOverhead) {
            v = UINT64_C(0);
        }

        // create new calibration data: take the minimum counter values seen
        // over 100 empty-ish measurements as the measuring overhead.
        auto newCalibration = mCalibratedOverhead;
        for (auto& v : newCalibration) {
            v = (std::numeric_limits<uint64_t>::max)();
        }
        for (size_t iter = 0; iter < 100; ++iter) {
            beginMeasure();
            op();
            endMeasure();
            if (mHasError) {
                return;
            }

            for (size_t i = 0; i < newCalibration.size(); ++i) {
                auto diff = mCounters[i];
                if (newCalibration[i] > diff) {
                    newCalibration[i] = diff;
                }
            }
        }

        mCalibratedOverhead = std::move(newCalibration);

        {
            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
            // see https://godbolt.org/z/49RVQ5
            // the random_device bits make numIters unpredictable to the optimizer.
            uint64_t const numIters = 100000U + (std::random_device{}() & 3);
            uint64_t n = numIters;
            uint32_t x = 1234567;
            auto fn = [&]() {
                x ^= x << 13;
                x ^= x >> 17;
                x ^= x << 5;
            };

            beginMeasure();
            while (n-- > 0) {
                fn();
            }
            endMeasure();
            detail::doNotOptimizeAway(x);
            auto measure1 = mCounters;

            n = numIters;
            beginMeasure();
            while (n-- > 0) {
                // we now run *twice* so we can easily calculate the overhead
                fn();
                fn();
            }
            endMeasure();
            detail::doNotOptimizeAway(x);
            auto measure2 = mCounters;

            for (size_t i = 0; i < mCounters.size(); ++i) {
                // factor 2 because we have two instructions per loop
                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;

                mLoopOverhead[i] = divRounded(overhead, numIters);
            }
        }
    }

private:
    bool monitor(uint32_t type, uint64_t eventid, Target target);

    // maps perf event id -> output target; filled by monitor()
    std::map<uint64_t, Target> mIdToTarget{};

    // start with minimum size of 3 for read_format
    // (layout: nr, time_enabled, time_running, then value/id pairs)
    std::vector<uint64_t> mCounters{3};
    std::vector<uint64_t> mCalibratedOverhead{3};
    std::vector<uint64_t> mLoopOverhead{3};

    uint64_t mTimeEnabledNanos = 0;
    uint64_t mTimeRunningNanos = 0;
    int mFd = -1;          // group leader fd; -1 until first monitor() succeeds
    bool mHasError = false;
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)2482 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2483
2484 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2485 if (-1 != mFd) {
2486 close(mFd);
2487 }
2488 }
2489
// Monitors a software event (e.g. page faults, context switches).
bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_SOFTWARE, swId, target);
}

// Monitors a hardware event (e.g. instructions, CPU cycles).
bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_HARDWARE, hwId, target);
}
2497
// overflow is ok, it's checked
// Decodes the PERF_FORMAT_GROUP read buffer (mCounters) and writes each
// monitored counter's value into its registered target, applying the
// calibrated measuring overhead and the per-iteration loop overhead
// (scaled by numIters) where requested. Subtractions are clamped at 0.
ANKERL_NANOBENCH_NO_SANITIZE("integer")
void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
    // clear old data
    for (auto& id_value : mIdToTarget) {
        *id_value.second.targetValue = UINT64_C(0);
    }

    if (mHasError) {
        return;
    }

    // buffer layout: [0]=nr, [1]=time_enabled, [2]=time_running, then
    // (value, id) pairs for each event in the group.
    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];

    for (uint64_t i = 0; i < mCounters[0]; ++i) {
        auto idx = static_cast<size_t>(3 + i * 2 + 0);
        auto id = mCounters[idx + 1U];

        auto it = mIdToTarget.find(id);
        if (it != mIdToTarget.end()) {

            auto& tgt = it->second;
            *tgt.targetValue = mCounters[idx];
            if (tgt.correctMeasuringOverhead) {
                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
                    *tgt.targetValue -= mCalibratedOverhead[idx];
                } else {
                    *tgt.targetValue = 0U;
                }
            }
            if (tgt.correctLoopOverhead) {
                auto correctionVal = mLoopOverhead[idx] * numIters;
                if (*tgt.targetValue >= correctionVal) {
                    *tgt.targetValue -= correctionVal;
                } else {
                    *tgt.targetValue = 0U;
                }
            }
        }
    }
}
2540
// Opens one perf event of the given type/eventid via perf_event_open, adds it
// to the event group led by mFd (the first opened event becomes the leader),
// registers the output target under the event's kernel-assigned id, and
// resizes the read buffers to fit the grown group. Returns false on any
// failure; the target value is preset to uint64_t max as "not available".
bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
    *target.targetValue = (std::numeric_limits<uint64_t>::max)();
    if (mHasError) {
        return false;
    }

    auto pea = perf_event_attr();
    std::memset(&pea, 0, sizeof(perf_event_attr));
    pea.type = type;
    pea.size = sizeof(perf_event_attr);
    pea.config = eventid;
    pea.disabled = 1; // start counter as disabled
    pea.exclude_kernel = 1;
    pea.exclude_hv = 1;

    // group read with ids & running times, so scaling/matching is possible
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
#    if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
#    else
    const unsigned long flags = 0;
#    endif

    // group under mFd; -1 on the very first call creates a new group leader
    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
    if (-1 == fd) {
        return false;
    }
    if (-1 == mFd) {
        // first call: set to fd, and use this from now on
        mFd = fd;
    }
    uint64_t id = 0;
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
        // couldn't get id
        return false;
    }

    // insert into map, rely on the fact that map's references are constant.
    mIdToTarget.emplace(id, target);

    // prepare readformat with the correct size (after the insert)
    auto size = 3 + 2 * mIdToTarget.size();
    mCounters.resize(size);
    mCalibratedOverhead.resize(size);
    mLoopOverhead.resize(size);

    return true;
}
2593
// Sets up the Linux perf backend and tries to enable each interesting counter.
// mHas records which counters could actually be monitored on this system.
PerformanceCounters::PerformanceCounters()
    : mPc(new LinuxPerformanceCounters())
    , mVal()
    , mHas() {

    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    mHas.contextSwitches =
        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
    mHas.branchInstructions =
        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
    // mHas.branchMisses = false;

    // start the counters and measure the overhead of the measurement itself,
    // using two back-to-back clock queries as an (empty) calibration workload.
    mPc->start();
    mPc->calibrate([] {
        auto before = ankerl::nanobench::Clock::now();
        auto after = ankerl::nanobench::Clock::now();
        (void)before;
        (void)after;
    });

    if (mPc->hasError()) {
        // something failed, don't monitor anything.
        mHas = PerfCountSet<bool>{};
    }
}
2622
~PerformanceCounters()2623 PerformanceCounters::~PerformanceCounters() {
2624 if (nullptr != mPc) {
2625 delete mPc;
2626 }
2627 }
2628
// thin forwarders to the platform specific implementation
void PerformanceCounters::beginMeasure() {
    mPc->beginMeasure();
}

void PerformanceCounters::endMeasure() {
    mPc->endMeasure();
}

void PerformanceCounters::updateResults(uint64_t numIters) {
    mPc->updateResults(numIters);
}
2640
2641 # else
2642
// fallback when no performance counter backend is available: all operations are no-ops
PerformanceCounters::PerformanceCounters() = default;
PerformanceCounters::~PerformanceCounters() = default;
void PerformanceCounters::beginMeasure() {}
void PerformanceCounters::endMeasure() {}
void PerformanceCounters::updateResults(uint64_t) {}
2648
2649 # endif
2650
// the counter values of the last measurement
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
    return mVal;
}
// flags telling which counters could actually be monitored
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
    return mHas;
}
2657
2658 // formatting utilities
2659 namespace fmt {
2660
2661 // adds thousands separator to numbers
NumSep::NumSep(char sep)
    : mSep(sep) {}

// the character inserted between digit groups
char NumSep::do_thousands_sep() const {
    return mSep;
}

// "\003" groups digits in runs of three (thousands grouping)
std::string NumSep::do_grouping() const {
    return "\003";
}

// RAII to save & restore a stream's state
StreamStateRestorer::StreamStateRestorer(std::ostream& s)
    : mStream(s)
    , mLocale(s.getloc())
    , mPrecision(s.precision())
    , mWidth(s.width())
    , mFill(s.fill())
    , mFmtFlags(s.flags()) {}

StreamStateRestorer::~StreamStateRestorer() {
    restore();
}

// sets back all stream info that we remembered at construction
void StreamStateRestorer::restore() {
    mStream.imbue(mLocale);
    mStream.precision(mPrecision);
    mStream.width(mWidth);
    mStream.fill(mFill);
    mStream.flags(mFmtFlags);
}

// fixed-width, fixed-precision number; integers are converted to double for formatting
Number::Number(int width, int precision, int64_t value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(static_cast<double>(value)) {}

Number::Number(int width, int precision, double value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(value) {}

// writes the value with thousands separators; the stream's state is restored afterwards
std::ostream& Number::write(std::ostream& os) const {
    StreamStateRestorer restorer(os);
    // the locale takes ownership of the NumSep facet
    os.imbue(std::locale(os.getloc(), new NumSep(',')));
    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
    return os;
}

std::string Number::to_s() const {
    std::stringstream ss;
    write(ss);
    return ss.str();
}
2717
// Converts an unsigned integer to its decimal string representation,
// without any locale dependence or thousands separators.
std::string to_s(uint64_t n) {
    // collect the digits least-significant first into a fixed buffer,
    // then emit them back-to-front. uint64_t has at most 20 decimal digits.
    char buf[20];
    size_t len = 0;
    do {
        buf[len] = static_cast<char>('0' + static_cast<char>(n % 10));
        ++len;
        n /= 10;
    } while (n != 0);

    std::string str;
    str.reserve(len);
    while (len != 0) {
        --len;
        str += buf[len];
    }
    return str;
}
2727
// stream a Number using its write() formatting
std::ostream& operator<<(std::ostream& os, Number const& n) {
    return n.write(os);
}
2731
// one column of the markdown result table: fixed width, title, suffix and a value
MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
    : mWidth(w)
    , mPrecision(prec)
    , mTitle(tit)
    , mSuffix(suff)
    , mValue(val) {}

// header cell: "|<right-aligned title> "
std::string MarkDownColumn::title() const {
    std::stringstream ss;
    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
    return ss.str();
}

// separator row cell: "|------:" (markdown right alignment)
std::string MarkDownColumn::separator() const {
    std::string sep(static_cast<size_t>(mWidth), '-');
    sep.front() = '|';
    sep.back() = ':';
    return sep;
}

// cell shown when no value is available: all blank except a single '-'
std::string MarkDownColumn::invalid() const {
    std::string sep(static_cast<size_t>(mWidth), ' ');
    sep.front() = '|';
    sep[sep.size() - 2] = '-';
    return sep;
}

// data cell: the formatted number followed by the column's suffix
std::string MarkDownColumn::value() const {
    std::stringstream ss;
    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
    return ss.str();
}

// Formats any text as markdown code, escaping backticks.
MarkDownCode::MarkDownCode(std::string const& what) {
    // +2 for the enclosing backticks (a hint; escaped backticks may add more)
    mWhat.reserve(what.size() + 2);
    mWhat.push_back('`');
    for (char c : what) {
        mWhat.push_back(c);
        if ('`' == c) {
            // escape a backtick inside the code span by doubling it
            mWhat.push_back('`');
        }
    }
    mWhat.push_back('`');
}

std::ostream& MarkDownCode::write(std::ostream& os) const {
    return os << mWhat;
}

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
    return mdCode.write(os);
}
2786 } // namespace fmt
2787 } // namespace detail
2788
2789 // provide implementation here so it's only generated once
Config::Config() = default;
Config::~Config() = default;
Config& Config::operator=(Config const&) = default;
Config& Config::operator=(Config&&) = default; // NOTE(review): not noexcept, unlike the move constructor - verify intent
Config::Config(Config const&) = default;
Config::Config(Config&&) noexcept = default;
2796
2797 // provide implementation here so it's only generated once
Result::~Result() = default;
Result& Result::operator=(Result const&) = default;
Result& Result::operator=(Result&&) = default; // NOTE(review): not noexcept, unlike the move constructor - verify intent
Result::Result(Result const&) = default;
Result::Result(Result&&) noexcept = default;
2803
2804 namespace detail {
// shorthand: converts an enum (class) value to its underlying integral type,
// used for indexing the per-measure vectors below
template <typename T>
inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
    return static_cast<typename std::underlying_type<T>::type>(val);
}
2809 } // namespace detail
2810
2811 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
Result::Result(Config const& benchmarkConfig)
    : mConfig(benchmarkConfig)
    , mNameToMeasurements{detail::u(Result::Measure::_size)} {} // one value-vector per Measure
2815
// Records the outcome of one epoch. All stored values are per single iteration;
// counters that could not be monitored (pc.has().* == false) are skipped.
void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
    using detail::d;
    using detail::u;

    double dIters = d(iters);
    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);

    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
    if (pc.has().pageFaults) {
        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
    }
    if (pc.has().cpuCycles) {
        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
    }
    if (pc.has().contextSwitches) {
        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
    }
    if (pc.has().instructions) {
        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
    }
    if (pc.has().branchInstructions) {
        double branchInstructions = 0.0;
        // correcting branches: remove branch introduced by the while (...) loop for each iteration.
        if (pc.val().branchInstructions > iters + 1U) {
            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
        }
        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);

        // branch misses are only meaningful relative to branch instructions,
        // so they are recorded only when both counters are available
        if (pc.has().branchMisses) {
            // correcting branch misses
            double branchMisses = d(pc.val().branchMisses);
            if (branchMisses > branchInstructions) {
                // can't have branch misses when there were branches...
                branchMisses = branchInstructions;
            }

            // assuming at least one missed branch for the loop
            branchMisses -= 1.0;
            if (branchMisses < 1.0) {
                branchMisses = 1.0;
            }
            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
        }
    }
}
2861
// the configuration this result was produced with
Config const& Result::config() const noexcept {
    return mConfig;
}
2865
// Median of the given values; 0 for an empty dataset.
// NOTE: sorts `data` in place - callers pass a disposable copy.
inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());

    auto const mid = data.size() / 2U;
    bool const oddCount = (data.size() % 2U) != 0U;
    if (oddCount) {
        // odd number of elements: the middle one is the median
        return data[mid];
    }
    // even number of elements: average the two middle ones
    return (data[mid - 1U] + data[mid]) / 2U;
}
2878
// median of all recorded values for the given measure; 0 if no data
double Result::median(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];
    return calcMedian(data);
}

// arithmetic mean of all recorded values for the given measure; 0 if no data
double Result::average(Measure m) const {
    using detail::d;
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    return sum(m) / d(data.size());
}
2895
double Result::medianAbsolutePercentError(Measure m) const {
    // create copy
    auto data = mNameToMeasurements[detail::u(m)];

    // calculates MdAPE which is the median of percentage error
    // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
    auto med = calcMedian(data);

    // transform the data to absolute error
    // NOTE(review): a data point of exactly 0 divides by zero (NaN) here - verify inputs are nonzero
    for (auto& x : data) {
        x = (x - med) / x;
        if (x < 0) {
            x = -x;
        }
    }
    return calcMedian(data);
}
2913
// sum of all recorded values for the given measure (0.0 for no data)
double Result::sum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return std::accumulate(data.begin(), data.end(), 0.0);
}
2918
sumProduct(Measure m1,Measure m2)2919 double Result::sumProduct(Measure m1, Measure m2) const noexcept {
2920 auto const& data1 = mNameToMeasurements[detail::u(m1)];
2921 auto const& data2 = mNameToMeasurements[detail::u(m2)];
2922
2923 if (data1.size() != data2.size()) {
2924 return 0.0;
2925 }
2926
2927 double result = 0.0;
2928 for (size_t i = 0, s = data1.size(); i != s; ++i) {
2929 result += data1[i] * data2[i];
2930 }
2931 return result;
2932 }
2933
// true if at least one value has been recorded for the given measure
bool Result::has(Measure m) const noexcept {
    return !mNameToMeasurements[detail::u(m)].empty();
}

// the value of measure m in epoch idx; throws std::out_of_range for an invalid idx
double Result::get(size_t idx, Measure m) const {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return data.at(idx);
}

bool Result::empty() const noexcept {
    return 0U == size();
}

// number of recorded epochs (elapsed is recorded for every epoch, so it defines the size)
size_t Result::size() const noexcept {
    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
    return data.size();
}

// smallest recorded value for the given measure; 0 if no data
double Result::minimum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // here it's safe to assume that at least one element is there
    return *std::min_element(data.begin(), data.end());
}

// largest recorded value for the given measure; 0 if no data
double Result::maximum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // here it's safe to assume that at least one element is there
    return *std::max_element(data.begin(), data.end());
}
2971
fromString(std::string const & str)2972 Result::Measure Result::fromString(std::string const& str) {
2973 if (str == "elapsed") {
2974 return Measure::elapsed;
2975 } else if (str == "iterations") {
2976 return Measure::iterations;
2977 } else if (str == "pagefaults") {
2978 return Measure::pagefaults;
2979 } else if (str == "cpucycles") {
2980 return Measure::cpucycles;
2981 } else if (str == "contextswitches") {
2982 return Measure::contextswitches;
2983 } else if (str == "instructions") {
2984 return Measure::instructions;
2985 } else if (str == "branchinstructions") {
2986 return Measure::branchinstructions;
2987 } else if (str == "branchmisses") {
2988 return Measure::branchmisses;
2989 } else {
2990 // not found, return _size
2991 return Measure::_size;
2992 }
2993 }
2994
2995 // Configuration of a microbenchmark.
Bench::Bench() {
    mConfig.mOut = &std::cout; // report to stdout by default
}

Bench::Bench(Bench&&) = default;
Bench& Bench::operator=(Bench&&) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;
3005
// the currently configured batch value
double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

// the currently configured problem size N used for complexity evaluation
double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}

// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
// means it is slower than the baseline.
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}
bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}

// enables/disables showing of performance counters in the output
Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}
bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}
3031
3032 // Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
3033 // If u differs from currently set unit, the stored results will be cleared.
3034 // Use singular (byte, not bytes).
Bench& Bench::unit(char const* u) {
    // content comparison (std::string vs char const*); a new unit invalidates old results
    if (u != mConfig.mUnit) {
        mResults.clear();
    }
    mConfig.mUnit = u;
    return *this;
}

Bench& Bench::unit(std::string const& u) {
    return unit(u.c_str());
}

std::string const& Bench::unit() const noexcept {
    return mConfig.mUnit;
}

// sets the time unit (and its display name) used when reporting results
Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
    mConfig.mTimeUnit = tu;
    mConfig.mTimeUnitName = tuName;
    return *this;
}

std::string const& Bench::timeUnitName() const noexcept {
    return mConfig.mTimeUnitName;
}

std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
    return mConfig.mTimeUnit;
}
3064
3065 // If benchmarkTitle differs from currently set title, the stored results will be cleared.
Bench& Bench::title(const char* benchmarkTitle) {
    // a new title starts a new table: results gathered under the old title are discarded
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}
Bench& Bench::title(std::string const& benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}

std::string const& Bench::title() const noexcept {
    return mConfig.mBenchmarkTitle;
}

// sets the name of the benchmark (unlike title(), this does not clear results)
Bench& Bench::name(const char* benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

Bench& Bench::name(std::string const& benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

std::string const& Bench::name() const noexcept {
    return mConfig.mBenchmarkName;
}
3098
3099 // Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
Bench& Bench::epochs(size_t numEpochs) noexcept {
    mConfig.mNumEpochs = numEpochs;
    return *this;
}
size_t Bench::epochs() const noexcept {
    return mConfig.mNumEpochs;
}

// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
    mConfig.mClockResolutionMultiple = multiple;
    return *this;
}
size_t Bench::clockResolutionMultiple() const noexcept {
    return mConfig.mClockResolutionMultiple;
}

// Sets the maximum time each epoch should take. Default is 100ms.
Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMaxEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
    return mConfig.mMaxEpochTime;
}
3125
// Sets the minimum time each epoch should take.
Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMinEpochTime = t; // lower bound for the time spent per epoch
    return *this;
}
std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
    return mConfig.mMinEpochTime;
}
3134
Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
    // clamp to at least one iteration per epoch
    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
    return *this;
}
uint64_t Bench::minEpochIterations() const noexcept {
    return mConfig.mMinEpochIterations;
}

// sets the exact number of iterations to perform per epoch
Bench& Bench::epochIterations(uint64_t numIters) noexcept {
    mConfig.mEpochIterations = numIters;
    return *this;
}
uint64_t Bench::epochIterations() const noexcept {
    return mConfig.mEpochIterations;
}

// number of warmup iterations
Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
    mConfig.mWarmup = numWarmupIters;
    return *this;
}
uint64_t Bench::warmup() const noexcept {
    return mConfig.mWarmup;
}
3158
// replaces the whole configuration at once
Bench& Bench::config(Config const& benchmarkConfig) {
    mConfig = benchmarkConfig;
    return *this;
}
Config const& Bench::config() const noexcept {
    return mConfig;
}

// sets the output stream used for reporting
Bench& Bench::output(std::ostream* outstream) noexcept {
    mConfig.mOut = outstream;
    return *this;
}

ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
    return mConfig.mOut;
}

// all results collected so far
std::vector<Result> const& Bench::results() const noexcept {
    return mResults;
}

// renders the collected results with the given template into os
Bench& Bench::render(char const* templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}

Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}
3189
// Fits the collected (complexityN, elapsed) measurements against a set of
// candidate complexity curves and returns them best fit first.
std::vector<BigO> Bench::complexityBigO() const {
    std::vector<BigO> bigOs;
    auto rangeMeasure = BigO::collectRangeMeasure(mResults);
    bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
        return 1.0;
    });
    bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
        return n;
    });
    bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
        return std::log2(n);
    });
    bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
        return n * std::log2(n);
    });
    bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
        return n * n;
    });
    bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
        return n * n * n;
    });
    // BigO::operator< orders by normalized RMS error first, so the best fit sorts first
    std::sort(bigOs.begin(), bigOs.end());
    return bigOs;
}
3214
// default construction: seed the two state words from std::random_device,
// retrying until at least one word is non-zero (all-zero state is rejected)
Rng::Rng()
    : mX(0)
    , mY(0) {
    std::random_device rd;
    std::uniform_int_distribution<uint64_t> dist;
    do {
        mX = dist(rd);
        mY = dist(rd);
    } while (mX == 0 && mY == 0);
}
3225
3226 ANKERL_NANOBENCH_NO_SANITIZE("integer")
// SplitMix64 (Sebastiano Vigna): advances `state` by the golden-gamma constant
// and scrambles the result through two xor-shift-multiply rounds. Used here
// only to derive well-distributed seeds for the main RNG.
uint64_t splitMix64(uint64_t& state) noexcept {
    state += UINT64_C(0x9e3779b97f4a7c15);
    uint64_t mixed = state;
    mixed = (mixed ^ (mixed >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
    mixed = (mixed ^ (mixed >> 27U)) * UINT64_C(0x94d049bb133111eb);
    return mixed ^ (mixed >> 31U);
}
3233
3234 // Seeded as described in romu paper (update april 2020)
Rng::Rng(uint64_t seed) noexcept
    : mX(splitMix64(seed)) // members initialize in declaration order: mX first, then mY; each call advances `seed`
    , mY(splitMix64(seed)) {
    // warm up the generator
    for (size_t i = 0; i < 10; ++i) {
        operator()();
    }
}

// only internally used to copy the RNG.
Rng::Rng(uint64_t x, uint64_t y) noexcept
    : mX(x)
    , mY(y) {}

// explicit duplicate with identical state (presumably to avoid accidental copies - verify class declaration)
Rng Rng::copy() const noexcept {
    return Rng{mX, mY};
}

// gathers (complexityN, median elapsed) pairs from all results that have complexityN set
BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
    BigO::RangeMeasure rangeMeasure;
    for (auto const& result : results) {
        if (result.config().mComplexityN > 0.0) {
            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
        }
    }
    return rangeMeasure;
}
3261
BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
    : mName(bigOName) {

    // estimate the constant factor: least-squares fit of measure = constant * range
    // through the origin, i.e. constant = sum(range*measure) / sum(range*range)
    double sumRangeMeasure = 0.0;
    double sumRangeRange = 0.0;

    for (size_t i = 0; i < rangeMeasure.size(); ++i) {
        sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
        sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
    }
    mConstant = sumRangeMeasure / sumRangeRange;

    // calculate root mean square of the residuals, normalized by the mean measurement
    // NOTE(review): an empty rangeMeasure gives 0/0 (NaN) here - callers must supply data
    double err = 0.0;
    double sumMeasure = 0.0;
    for (size_t i = 0; i < rangeMeasure.size(); ++i) {
        auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
        err += diff * diff;

        sumMeasure += rangeMeasure[i].second;
    }

    auto n = static_cast<double>(rangeMeasure.size());
    auto mean = sumMeasure / n;
    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
}

BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
    : BigO(std::string(bigOName), rangeMeasure) {}
3292
std::string const& BigO::name() const noexcept {
    return mName;
}

// the fitted constant factor
double BigO::constant() const noexcept {
    return mConstant;
}

// goodness of fit: RMS of the residuals normalized by the mean measurement (lower is better)
double BigO::normalizedRootMeanSquare() const noexcept {
    return mNormalizedRootMeanSquare;
}

// orders by fit quality first (then name), so the best fitting complexity sorts first
bool BigO::operator<(BigO const& other) const noexcept {
    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
}

std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
}

// prints a markdown table with one row per candidate complexity
std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
    detail::fmt::StreamStateRestorer restorer(os);
    os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
    for (auto const& bigO : bigOs) {
        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
        os << "| " << bigO.name();
        os << std::endl;
    }
    return os;
}
3324
3325 } // namespace nanobench
3326 } // namespace ankerl
3327
3328 #endif // ANKERL_NANOBENCH_IMPLEMENT
3329 #endif // ANKERL_NANOBENCH_H_INCLUDED
3330