nanobench.h
1// __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2// | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3// | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4//
5// Microbenchmark framework for C++11/14/17/20
6// https://github.com/martinus/nanobench
7//
8// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9// SPDX-License-Identifier: MIT
10// Copyright (c) 2019-2021 Martin Ankerl <[email protected]>
11//
12// Permission is hereby granted, free of charge, to any person obtaining a copy
13// of this software and associated documentation files (the "Software"), to deal
14// in the Software without restriction, including without limitation the rights
15// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16// copies of the Software, and to permit persons to whom the Software is
17// furnished to do so, subject to the following conditions:
18//
19// The above copyright notice and this permission notice shall be included in all
20// copies or substantial portions of the Software.
21//
22// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28// SOFTWARE.
29
30#ifndef ANKERL_NANOBENCH_H_INCLUDED
31#define ANKERL_NANOBENCH_H_INCLUDED
32
33// see https://semver.org/
34#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36#define ANKERL_NANOBENCH_VERSION_PATCH 6 // backwards-compatible bug fixes
37
39// public-facing API - as minimal as possible
41
42#include <chrono> // high_resolution_clock
43#include <cstring> // memcpy
44#include <iosfwd> // for std::ostream* custom output target in Config
45#include <string> // all names
46#include <vector> // holds all results
47
48#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
49
50#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
55
56#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
58#else
59# define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
60#endif
61
62#if defined(__clang__)
63# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
66#else
67# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
69#endif
70
71#if defined(__GNUC__)
72# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
74#else
75# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
77#endif
78
79#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
80# include <iostream>
81# define ANKERL_NANOBENCH_LOG(x) \
82 do { \
83 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
84 } while (0)
85#else
86# define ANKERL_NANOBENCH_LOG(x) \
87 do { \
88 } while (0)
89#endif
90
91#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
92#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
93# include <linux/version.h>
94# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
95// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
96// PERF_FLAG_FD_CLOEXEC since kernel 3.14
97# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
98# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
99# endif
100#endif
101
102#if defined(__clang__)
103# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
104#else
105# define ANKERL_NANOBENCH_NO_SANITIZE(...)
106#endif
107
108#if defined(_MSC_VER)
109# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
110#else
111# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
112#endif
113
114// workaround missing "is_trivially_copyable" in g++ < 5.0
115// See https://stackoverflow.com/a/31798726/48181
116#if defined(__GNUC__) && __GNUC__ < 5
117# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
118#else
119# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
120#endif
121
122// declarations ///////////////////////////////////////////////////////////////////////////////////
123
124namespace ankerl {
125namespace nanobench {
126
127using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
128 std::chrono::steady_clock>::type;
129class Bench;
130struct Config;
131class Result;
132class Rng;
133class BigO;
134
283void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
284void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
285
294void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
295void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
296
297// Contains mustache-like templates
298namespace templates {
299
309char const* csv() noexcept;
310
321char const* htmlBoxplot() noexcept;
322
329char const* pyperf() noexcept;
330
340char const* json() noexcept;
341
342} // namespace templates
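// Example (illustrative, not part of this header): the templates above are meant to be
// passed to render(). A minimal sketch, assuming a Bench object that has already
// produced results:
//
//     ankerl::nanobench::Bench bench;
//     bench.run("some op", [] { /* code to benchmark */ });
//     ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);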
343
344namespace detail {
345
346template <typename T>
347struct PerfCountSet;
348
349class IterationLogic;
350class PerformanceCounters;
351
352#if ANKERL_NANOBENCH(PERF_COUNTERS)
353class LinuxPerformanceCounters;
354#endif
355
356} // namespace detail
357} // namespace nanobench
358} // namespace ankerl
359
360// definitions ////////////////////////////////////////////////////////////////////////////////////
361
362namespace ankerl {
363namespace nanobench {
364namespace detail {
365
366template <typename T>
367struct PerfCountSet {
368 T pageFaults{};
369 T cpuCycles{};
370 T contextSwitches{};
371 T instructions{};
372 T branchInstructions{};
373 T branchMisses{};
374};
375
376} // namespace detail
377
378ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
379struct Config {
380 // actual benchmark config
381 std::string mBenchmarkTitle = "benchmark";
382 std::string mBenchmarkName = "noname";
383 std::string mUnit = "op";
384 double mBatch = 1.0;
385 double mComplexityN = -1.0;
386 size_t mNumEpochs = 11;
387 size_t mClockResolutionMultiple = static_cast<size_t>(1000);
388 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
389 std::chrono::nanoseconds mMinEpochTime{};
390 uint64_t mMinEpochIterations{1};
391 uint64_t mEpochIterations{0}; // If not 0, run *exactly* these number of iterations per epoch.
392 uint64_t mWarmup = 0;
393 std::ostream* mOut = nullptr;
394 std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
395 std::string mTimeUnitName = "ns";
396 bool mShowPerformanceCounters = true;
397 bool mIsRelative = false;
398
403 Config(Config const&);
404 Config(Config&&) noexcept;
405};
406ANKERL_NANOBENCH(IGNORE_PADDED_POP)
407
408// Result returned after a benchmark has finished. Can be used as a baseline for relative().
409ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
410class Result {
411public:
412 enum class Measure : size_t {
413 elapsed,
414 iterations,
415 pagefaults,
416 cpucycles,
417 contextswitches,
418 instructions,
419 branchinstructions,
420 branchmisses,
421 _size
422 };
423
424 explicit Result(Config const& benchmarkConfig);
425
429 Result(Result const&);
430 Result(Result&&) noexcept;
431
432 // adds new measurement results
433 // all values are scaled by iters (except iters...)
434 void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
435
436 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
437
438 ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
439 ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
440 ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
441 ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
442 ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
443 ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
444 ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
445
446 ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
447 ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
448 ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
449 ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
450
451 // Finds the given string; if not found, returns Measure::_size.
452 static Measure fromString(std::string const& str);
453
454private:
455 Config mConfig{};
456 std::vector<std::vector<double>> mNameToMeasurements{};
457};
458ANKERL_NANOBENCH(IGNORE_PADDED_POP)
459
460
477class Rng final {
478public:
482 using result_type = uint64_t;
483
484 static constexpr uint64_t(min)();
485 static constexpr uint64_t(max)();
486
492 Rng(Rng const&) = delete;
493
497 Rng& operator=(Rng const&) = delete;
498
499 // moving is ok
500 Rng(Rng&&) noexcept = default;
501 Rng& operator=(Rng&&) noexcept = default;
502 ~Rng() noexcept = default;
503
512
529 explicit Rng(uint64_t seed) noexcept;
530 Rng(uint64_t x, uint64_t y) noexcept;
531 Rng(std::vector<uint64_t> const& data);
532
536 ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
537
545 inline uint64_t operator()() noexcept;
546
547 // This is slightly biased: bounded() maps a 32 bit random value onto [0, range) with a multiply-and-shift, without rejection sampling.
548
563 inline uint32_t bounded(uint32_t range) noexcept;
564
565 // random double in range [0, 1)
566 // see http://prng.di.unimi.it/
567
574 inline double uniform01() noexcept;
575
583 template <typename Container>
584 void shuffle(Container& container) noexcept;
585
592 std::vector<uint64_t> state() const;
593
594private:
595 static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
596
597 uint64_t mX;
598 uint64_t mY;
599};
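// Example (illustrative, not part of this header): typical use of Rng as a small, fast
// random engine. All calls below use members declared above.
//
//     ankerl::nanobench::Rng rng(12345);   // deterministic seed
//     uint64_t raw = rng();                // full 64 bit random value
//     uint32_t die = rng.bounded(6);       // value in [0, 6)
//     double p = rng.uniform01();          // value in [0, 1)
//     std::vector<int> v{1, 2, 3, 4};
//     rng.shuffle(v);                      // in-place shuffle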
600
615ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
616class Bench {
617public:
621 Bench();
622
623 Bench(Bench&& other);
624 Bench& operator=(Bench&& other);
625 Bench(Bench const& other);
626 Bench& operator=(Bench const& other);
627 ~Bench() noexcept;
628
647 template <typename Op>
648 ANKERL_NANOBENCH(NOINLINE)
649 Bench& run(char const* benchmarkName, Op&& op);
650
651 template <typename Op>
652 ANKERL_NANOBENCH(NOINLINE)
653 Bench& run(std::string const& benchmarkName, Op&& op);
654
659 template <typename Op>
660 ANKERL_NANOBENCH(NOINLINE)
661 Bench& run(Op&& op);
662
668 Bench& title(char const* benchmarkTitle);
669 Bench& title(std::string const& benchmarkTitle);
670 ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
671
673 Bench& name(char const* benchmarkName);
674 Bench& name(std::string const& benchmarkName);
675 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
676
686 template <typename T>
687 Bench& batch(T b) noexcept;
688 ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
689
698 Bench& unit(char const* unit);
699 Bench& unit(std::string const& unit);
700 ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
701
711 Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
712 ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
713 ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
714
722 Bench& output(std::ostream* outstream) noexcept;
723 ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
724
745 Bench& clockResolutionMultiple(size_t multiple) noexcept;
746 ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
747
763 Bench& epochs(size_t numEpochs) noexcept;
764 ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
765
776 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
777 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
778
789 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
790 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
791
802 Bench& minEpochIterations(uint64_t numIters) noexcept;
803 ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
804
811 Bench& epochIterations(uint64_t numIters) noexcept;
812 ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
813
823 Bench& warmup(uint64_t numWarmupIters) noexcept;
824 ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
825
843 Bench& relative(bool isRelativeEnabled) noexcept;
844 ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
845
854 Bench& performanceCounters(bool showPerformanceCounters) noexcept;
855 ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
856
865 ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
866
874 template <typename Arg>
875 Bench& doNotOptimizeAway(Arg&& arg);
876
891 template <typename T>
892 Bench& complexityN(T b) noexcept;
893 ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
894
926 std::vector<BigO> complexityBigO() const;
927
951 template <typename Op>
952 BigO complexityBigO(char const* name, Op op) const;
953
954 template <typename Op>
955 BigO complexityBigO(std::string const& name, Op op) const;
956
964 Bench& render(char const* templateContent, std::ostream& os);
965 Bench& render(std::string const& templateContent, std::ostream& os);
966
967 Bench& config(Config const& benchmarkConfig);
968 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
969
970private:
971 Config mConfig{};
972 std::vector<Result> mResults{};
973};
974ANKERL_NANOBENCH(IGNORE_PADDED_POP)
975
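// Example (illustrative, not part of this header): a configured benchmark. Every setter
// returns *this, so calls can be chained before run(). `hash(buf)` and `buf` are
// hypothetical stand-ins for the code under test.
//
//     ankerl::nanobench::Bench()
//         .title("hashing")
//         .unit("byte")
//         .batch(buf.size())          // bytes processed per iteration
//         .minEpochIterations(50)
//         .run("my hash", [&] { ankerl::nanobench::doNotOptimizeAway(hash(buf)); });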
976
982template <typename Arg>
983void doNotOptimizeAway(Arg&& arg);
984
985namespace detail {
986
987#if defined(_MSC_VER)
988void doNotOptimizeAwaySink(void const*);
989
990template <typename T>
991void doNotOptimizeAway(T const& val);
992
993#else
994
995// This assembly magic is taken directly from what Google Benchmark does. I previously used what Facebook's folly does, but
996// that seemed to cause compilation problems in some cases. Google Benchmark appears to be the best tested approach anyway.
997// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
998template <typename T>
999void doNotOptimizeAway(T const& val) {
1000 // NOLINTNEXTLINE(hicpp-no-assembler)
1001 asm volatile("" : : "r,m"(val) : "memory");
1002}
1003
1004template <typename T>
1005void doNotOptimizeAway(T& val) {
1006# if defined(__clang__)
1007 // NOLINTNEXTLINE(hicpp-no-assembler)
1008 asm volatile("" : "+r,m"(val) : : "memory");
1009# else
1010 // NOLINTNEXTLINE(hicpp-no-assembler)
1011 asm volatile("" : "+m,r"(val) : : "memory");
1012# endif
1013}
1014#endif
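// Example (illustrative): without doNotOptimizeAway() the optimizer may delete the
// benchmarked work entirely because its result is unused. A sketch, where `bench` is an
// existing ankerl::nanobench::Bench and `data` is a hypothetical container:
//
//     bench.run("sum", [&] {
//         uint64_t s = 0;
//         for (auto x : data) {
//             s += x;
//         }
//         ankerl::nanobench::doNotOptimizeAway(s); // keeps `s` (and the loop) alive
//     });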
1015
1016// internally used, but visible because run() is templated.
1017// Not movable or copyable, so we simply use a raw pointer instead of unique_ptr. This saves us from
1018// having to include <memory>, and avoids the template instantiation overhead of unique_ptr, which is unfortunately quite significant.
1019ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1020class IterationLogic {
1021public:
1022 explicit IterationLogic(Bench const& config) noexcept;
1023 ~IterationLogic();
1024
1025 ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1026 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1027 void moveResultTo(std::vector<Result>& results) noexcept;
1028
1029private:
1030 struct Impl;
1031 Impl* mPimpl;
1032};
1033ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1034
1035ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1036class PerformanceCounters {
1037public:
1040
1043
1044 void beginMeasure();
1045 void endMeasure();
1046 void updateResults(uint64_t numIters);
1047
1048 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1049 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1050
1051private:
1052#if ANKERL_NANOBENCH(PERF_COUNTERS)
1053 LinuxPerformanceCounters* mPc = nullptr;
1054#endif
1055 PerfCountSet<uint64_t> mVal{};
1056 PerfCountSet<bool> mHas{};
1057};
1058ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1059
1060// Gets the singleton
1061PerformanceCounters& performanceCounters();
1062
1063} // namespace detail
1064
1065class BigO {
1066public:
1067 using RangeMeasure = std::vector<std::pair<double, double>>;
1068
1069 template <typename Op>
1070 static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
1071 for (auto& rangeMeasure : data) {
1072 rangeMeasure.first = op(rangeMeasure.first);
1073 }
1074 return data;
1075 }
1076
1077 static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1078
1079 template <typename Op>
1080 BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1081 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1082
1083 template <typename Op>
1084 BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1085 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1086
1087 BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1088 BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
1089 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1090 ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1091 ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1092 ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1093
1094private:
1095 std::string mName{};
1096 double mConstant{};
1097 double mNormalizedRootMeanSquare{};
1098};
1099std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1100std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1101
1102} // namespace nanobench
1103} // namespace ankerl
1104
1105// implementation /////////////////////////////////////////////////////////////////////////////////
1106
1107namespace ankerl {
1108namespace nanobench {
1109
1110constexpr uint64_t(Rng::min)() {
1111 return 0;
1112}
1113
1114constexpr uint64_t(Rng::max)() {
1115 return (std::numeric_limits<uint64_t>::max)();
1116}
1117
1118ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1119uint64_t Rng::operator()() noexcept {
1120 auto x = mX;
1121
1122 mX = UINT64_C(15241094284759029579) * mY;
1123 mY = rotl(mY - x, 27);
1124
1125 return x;
1126}
1127
1128ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1129uint32_t Rng::bounded(uint32_t range) noexcept {
1130 uint64_t r32 = static_cast<uint32_t>(operator()());
1131 auto multiresult = r32 * range;
1132 return static_cast<uint32_t>(multiresult >> 32U);
1133}
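// Worked example (illustrative): for range = 6, r32 is uniform in [0, 2^32), so the
// 64 bit product r32 * 6 lies in [0, 6 * 2^32) and its upper 32 bits (>> 32) give a
// value in [0, 6). The slight bias exists because 2^32 is not divisible by 6.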
1134
1135double Rng::uniform01() noexcept {
1136 auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1137 // can't use a union in C++ here for type punning, it's undefined behavior.
1138 // std::memcpy is optimized away anyway.
1139 double d;
1140 std::memcpy(&d, &i, sizeof(double));
1141 return d - 1.0;
1142}
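// Worked example (illustrative): (0x3ff << 52) sets the IEEE-754 exponent so that the
// assembled bit pattern is a double in [1.0, 2.0); the top 52 random bits of
// operator()() fill the mantissa. Subtracting 1.0 then yields a uniform value in [0, 1).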
1143
1144template <typename Container>
1145void Rng::shuffle(Container& container) noexcept {
1146 auto size = static_cast<uint32_t>(container.size());
1147 for (auto i = size; i > 1U; --i) {
1148 using std::swap;
1149 auto p = bounded(i); // number in [0, i)
1150 swap(container[i - 1], container[p]);
1151 }
1152}
1153
1154ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1155constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1156 return (x << k) | (x >> (64U - k));
1157}
1158
1159template <typename Op>
1161Bench& Bench::run(Op&& op) {
1162 // It is important that this method is kept short so the compiler can do better optimizations/inlining of op()
1163 detail::IterationLogic iterationLogic(*this);
1164 auto& pc = detail::performanceCounters();
1165
1166 while (auto n = iterationLogic.numIters()) {
1167 pc.beginMeasure();
1168 Clock::time_point before = Clock::now();
1169 while (n-- > 0) {
1170 op();
1171 }
1172 Clock::time_point after = Clock::now();
1173 pc.endMeasure();
1174 pc.updateResults(iterationLogic.numIters());
1175 iterationLogic.add(after - before, pc);
1176 }
1177 iterationLogic.moveResultTo(mResults);
1178 return *this;
1179}
1180
1181// Performs all evaluations.
1182template <typename Op>
1183Bench& Bench::run(char const* benchmarkName, Op&& op) {
1184 name(benchmarkName);
1185 return run(std::forward<Op>(op));
1186}
1187
1188template <typename Op>
1189Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1190 name(benchmarkName);
1191 return run(std::forward<Op>(op));
1192}
1193
1194template <typename Op>
1195BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1196 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1197}
1198
1199template <typename Op>
1200BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1201 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1202}
1203
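// Example (illustrative, not part of this header): asymptotic complexity is measured by
// tagging each run with complexityN() and fitting Big-O curves afterwards.
//
//     ankerl::nanobench::Bench bench;
//     for (size_t n = 100; n <= 100000; n *= 10) {
//         std::vector<int> v(n);
//         bench.complexityN(n).run("sort", [&] { std::sort(v.begin(), v.end()); });
//     }
//     std::cout << bench.complexityBigO() << std::endl; // prints the fitted curves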
1204// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1205// Any argument is cast to double.
1206template <typename T>
1207Bench& Bench::batch(T b) noexcept {
1208 mConfig.mBatch = static_cast<double>(b);
1209 return *this;
1210}
1211
1212// Sets the computation complexity of the next run. Any argument is cast to double.
1213template <typename T>
1214Bench& Bench::complexityN(T n) noexcept {
1215 mConfig.mComplexityN = static_cast<double>(n);
1216 return *this;
1217}
1218
1219// Convenience: makes sure none of the given arguments are optimized away by the compiler.
1220template <typename Arg>
1221Bench& Bench::doNotOptimizeAway(Arg&& arg) {
1222 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1223 return *this;
1224}
1225
1226// Makes sure none of the given arguments are optimized away by the compiler.
1227template <typename Arg>
1228void doNotOptimizeAway(Arg&& arg) {
1229 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1230}
1231
1232namespace detail {
1233
1234#if defined(_MSC_VER)
1235template <typename T>
1236void doNotOptimizeAway(T const& val) {
1237 doNotOptimizeAwaySink(&val);
1238}
1239
1240#endif
1241
1242} // namespace detail
1243} // namespace nanobench
1244} // namespace ankerl
1245
1246#if defined(ANKERL_NANOBENCH_IMPLEMENT)
1247
1249// implementation part - only visible in .cpp
1251
1252# include <algorithm> // sort, reverse
1253# include <atomic> // compare_exchange_strong in loop overhead
1254# include <cstdlib> // getenv
1255# include <cstring> // strstr, strncmp
1256# include <fstream> // ifstream to parse proc files
1257# include <iomanip> // setw, setprecision
1258# include <iostream> // cout
1259# include <numeric> // accumulate
1260# include <random> // random_device
1261# include <sstream> // to_s in Number
1262# include <stdexcept> // throw for rendering templates
1263# include <tuple> // std::tie
1264# if defined(__linux__)
1265# include <unistd.h> //sysconf
1266# endif
1267# if ANKERL_NANOBENCH(PERF_COUNTERS)
1268# include <map> // map
1269
1270# include <linux/perf_event.h>
1271# include <sys/ioctl.h>
1272# include <sys/syscall.h>
1273# include <unistd.h>
1274# endif
1275
1276// declarations ///////////////////////////////////////////////////////////////////////////////////
1277
1278namespace ankerl {
1279namespace nanobench {
1280
1281// helper stuff that is only intended to be used internally
1282namespace detail {
1283
1284struct TableInfo;
1285
1286// formatting utilities
1287namespace fmt {
1288
1289class NumSep;
1290class StreamStateRestorer;
1291class Number;
1292class MarkDownColumn;
1293class MarkDownCode;
1294
1295} // namespace fmt
1296} // namespace detail
1297} // namespace nanobench
1298} // namespace ankerl
1299
1300// definitions ////////////////////////////////////////////////////////////////////////////////////
1301
1302namespace ankerl {
1303namespace nanobench {
1304
1305uint64_t splitMix64(uint64_t& state) noexcept;
1306
1307namespace detail {
1308
1309// helpers to get double values
1310template <typename T>
1311inline double d(T t) noexcept {
1312 return static_cast<double>(t);
1313}
1314inline double d(Clock::duration duration) noexcept {
1315 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1316}
1317
1318// Calculates clock resolution once, and remembers the result
1319inline Clock::duration clockResolution() noexcept;
1320
1321} // namespace detail
1322
1323namespace templates {
1324
1325char const* csv() noexcept {
1326 return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1327{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1328{{/result}})DELIM";
1329}
1330
1331char const* htmlBoxplot() noexcept {
1332 return R"DELIM(<html>
1333
1334<head>
1335 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1336</head>
1337
1338<body>
1339 <div id="myDiv"></div>
1340 <script>
1341 var data = [
1342 {{#result}}{
1343 name: '{{name}}',
1344 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1345 },
1346 {{/result}}
1347 ];
1348 var title = '{{title}}';
1349
1350 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1351 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1352 </script>
1353</body>
1354
1355</html>)DELIM";
1356}
1357
1358char const* pyperf() noexcept {
1359 return R"DELIM({
1360 "benchmarks": [
1361 {
1362 "runs": [
1363 {
1364 "values": [
1365{{#measurement}} {{elapsed}}{{^-last}},
1366{{/last}}{{/measurement}}
1367 ]
1368 }
1369 ]
1370 }
1371 ],
1372 "metadata": {
1373 "loops": {{sum(iterations)}},
1374 "inner_loops": {{batch}},
1375 "name": "{{title}}",
1376 "unit": "second"
1377 },
1378 "version": "1.0"
1379})DELIM";
1380}
1381
1382char const* json() noexcept {
1383 return R"DELIM({
1384 "results": [
1385{{#result}} {
1386 "title": "{{title}}",
1387 "name": "{{name}}",
1388 "unit": "{{unit}}",
1389 "batch": {{batch}},
1390 "complexityN": {{complexityN}},
1391 "epochs": {{epochs}},
1392 "clockResolution": {{clockResolution}},
1393 "clockResolutionMultiple": {{clockResolutionMultiple}},
1394 "maxEpochTime": {{maxEpochTime}},
1395 "minEpochTime": {{minEpochTime}},
1396 "minEpochIterations": {{minEpochIterations}},
1397 "epochIterations": {{epochIterations}},
1398 "warmup": {{warmup}},
1399 "relative": {{relative}},
1400 "median(elapsed)": {{median(elapsed)}},
1401 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1402 "median(instructions)": {{median(instructions)}},
1403 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1404 "median(cpucycles)": {{median(cpucycles)}},
1405 "median(contextswitches)": {{median(contextswitches)}},
1406 "median(pagefaults)": {{median(pagefaults)}},
1407 "median(branchinstructions)": {{median(branchinstructions)}},
1408 "median(branchmisses)": {{median(branchmisses)}},
1409 "totalTime": {{sumProduct(iterations, elapsed)}},
1410 "measurements": [
1411{{#measurement}} {
1412 "iterations": {{iterations}},
1413 "elapsed": {{elapsed}},
1414 "pagefaults": {{pagefaults}},
1415 "cpucycles": {{cpucycles}},
1416 "contextswitches": {{contextswitches}},
1417 "instructions": {{instructions}},
1418 "branchinstructions": {{branchinstructions}},
1419 "branchmisses": {{branchmisses}}
1420 }{{^-last}},{{/-last}}
1421{{/measurement}} ]
1422 }{{^-last}},{{/-last}}
1423{{/result}} ]
1424})DELIM";
1425}
1426
1427ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1428struct Node {
1429 enum class Type { tag, content, section, inverted_section };
1430
1431 char const* begin;
1432 char const* end;
1433 std::vector<Node> children;
1434 Type type;
1435
1436 template <size_t N>
1437 // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
1438 bool operator==(char const (&str)[N]) const noexcept {
1439 return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1440 }
1441};
1442ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1443
1444static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1445 std::vector<Node> nodes;
1446
1447 while (true) {
1448 auto begin = std::strstr(*tpl, "{{");
1449 auto end = begin;
1450 if (begin != nullptr) {
1451 begin += 2;
1452 end = std::strstr(begin, "}}");
1453 }
1454
1455 if (begin == nullptr || end == nullptr) {
1456 // nothing found, finish node
1457 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1458 return nodes;
1459 }
1460
1461 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1462
1463 // we found a tag
1464 *tpl = end + 2;
1465 switch (*begin) {
1466 case '/':
1467 // finished! bail out
1468 return nodes;
1469
1470 case '#':
1471 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1472 break;
1473
1474 case '^':
1475 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1476 break;
1477
1478 default:
1479 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1480 break;
1481 }
1482 }
1483}
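// Example (illustrative): parsing "ab{{#result}}x{{name}}{{/result}}c" produces a
// content node "ab", a section node "result" (whose children are the content "x",
// the tag "name", and an empty trailing content node), and a final content node "c".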
1484
1485static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1486 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1487 bool matchFirst = n == "-first";
1488 bool matchLast = n == "-last";
1489 if (!matchFirst && !matchLast) {
1490 return false;
1491 }
1492
1493 bool doWrite = false;
1494 if (n.type == Node::Type::section) {
1495 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1496 } else if (n.type == Node::Type::inverted_section) {
1497 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1498 }
1499
1500 if (doWrite) {
1501 for (auto const& child : n.children) {
1502 if (child.type == Node::Type::content) {
1503 out.write(child.begin, std::distance(child.begin, child.end));
1504 }
1505 }
1506 }
1507 return true;
1508}
1509
1510static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
1511 matchResult.clear();
1512 auto idxOpen = str.find('(');
1513 auto idxClose = str.find(')', idxOpen);
1514 if (idxClose == std::string::npos) {
1515 return false;
1516 }
1517
1518 matchResult.emplace_back(str.substr(0, idxOpen));
1519
1520 // split by comma
1521 matchResult.emplace_back(std::string{});
1522 for (size_t i = idxOpen + 1; i != idxClose; ++i) {
1523 if (str[i] == ' ' || str[i] == '\t') {
1524 // skip whitespace
1525 continue;
1526 }
1527 if (str[i] == ',') {
1528 // got a comma => new string
1529 matchResult.emplace_back(std::string{});
1530 continue;
1531 }
1532 // no whitespace no comma, append
1533 matchResult.back() += str[i];
1534 }
1535 return true;
1536}
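// Example (illustrative): matchCmdArgs("median(elapsed)", m) fills m with
// {"median", "elapsed"}; matchCmdArgs("sumProduct(iterations, elapsed)", m)
// fills it with {"sumProduct", "iterations", "elapsed"}.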
1537
1538static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1539 using detail::d;
1540
1541 if (n == "title") {
1542 out << config.mBenchmarkTitle;
1543 return true;
1544 } else if (n == "name") {
1545 out << config.mBenchmarkName;
1546 return true;
1547 } else if (n == "unit") {
1548 out << config.mUnit;
1549 return true;
1550 } else if (n == "batch") {
1551 out << config.mBatch;
1552 return true;
1553 } else if (n == "complexityN") {
1554 out << config.mComplexityN;
1555 return true;
1556 } else if (n == "epochs") {
1557 out << config.mNumEpochs;
1558 return true;
1559 } else if (n == "clockResolution") {
1560 out << d(detail::clockResolution());
1561 return true;
1562 } else if (n == "clockResolutionMultiple") {
1563 out << config.mClockResolutionMultiple;
1564 return true;
1565 } else if (n == "maxEpochTime") {
1566 out << d(config.mMaxEpochTime);
1567 return true;
1568 } else if (n == "minEpochTime") {
1569 out << d(config.mMinEpochTime);
1570 return true;
1571 } else if (n == "minEpochIterations") {
1572 out << config.mMinEpochIterations;
1573 return true;
1574 } else if (n == "epochIterations") {
1575 out << config.mEpochIterations;
1576 return true;
1577 } else if (n == "warmup") {
1578 out << config.mWarmup;
1579 return true;
1580 } else if (n == "relative") {
1581 out << config.mIsRelative;
1582 return true;
1583 }
1584 return false;
1585}
1586
1587static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1588 if (generateConfigTag(n, r.config(), out)) {
1589 return out;
1590 }
1591 // match e.g. "median(elapsed)"
1592 // g++ 4.8 doesn't implement std::regex :(
1593 // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
1594 // std::cmatch matchResult;
1595 // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1596 std::vector<std::string> matchResult;
1597 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1598 if (matchResult.size() == 2) {
1599 auto m = Result::fromString(matchResult[1]);
1600 if (m == Result::Measure::_size) {
1601 return out << 0.0;
1602 }
1603
1604 if (matchResult[0] == "median") {
1605 return out << r.median(m);
1606 }
1607 if (matchResult[0] == "average") {
1608 return out << r.average(m);
1609 }
1610 if (matchResult[0] == "medianAbsolutePercentError") {
1611 return out << r.medianAbsolutePercentError(m);
1612 }
1613 if (matchResult[0] == "sum") {
1614 return out << r.sum(m);
1615 }
1616 if (matchResult[0] == "minimum") {
1617 return out << r.minimum(m);
1618 }
1619 if (matchResult[0] == "maximum") {
1620 return out << r.maximum(m);
1621 }
1622 } else if (matchResult.size() == 3) {
1623 auto m1 = Result::fromString(matchResult[1]);
1624 auto m2 = Result::fromString(matchResult[2]);
1625 if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1626 return out << 0.0;
1627 }
1628
1629 if (matchResult[0] == "sumProduct") {
1630 return out << r.sumProduct(m1, m2);
1631 }
1632 }
1633 }
1634
1635 // match e.g. "sumProduct(elapsed, iterations)"
1636 // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");
1637
1638 // nothing matches :(
1639 throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1640}
1641
1642static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1643 for (auto const& n : nodes) {
1644 if (!generateFirstLast(n, idx, r.size(), out)) {
1645 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1646 switch (n.type) {
1647 case Node::Type::content:
1648 out.write(n.begin, std::distance(n.begin, n.end));
1649 break;
1650
1651 case Node::Type::inverted_section:
1652 throw std::runtime_error("got an inverted section inside measurement");
1653
1654 case Node::Type::section:
1655 throw std::runtime_error("got a section inside measurement");
1656
1657 case Node::Type::tag: {
1658 auto m = Result::fromString(std::string(n.begin, n.end));
1659 if (m == Result::Measure::_size || !r.has(m)) {
1660 out << 0.0;
1661 } else {
1662 out << r.get(idx, m);
1663 }
1664 break;
1665 }
1666 }
1667 }
1668 }
1669}
1670
1671static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1672 auto const& r = results[idx];
1673 for (auto const& n : nodes) {
1674 if (!generateFirstLast(n, idx, results.size(), out)) {
1675 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1676 switch (n.type) {
1677 case Node::Type::content:
1678 out.write(n.begin, std::distance(n.begin, n.end));
1679 break;
1680
1681 case Node::Type::inverted_section:
1682 throw std::runtime_error("got an inverted section inside result");
1683
1684 case Node::Type::section:
1685 if (n == "measurement") {
1686 for (size_t i = 0; i < r.size(); ++i) {
1687 generateResultMeasurement(n.children, i, r, out);
1688 }
1689 } else {
1690 throw std::runtime_error("got a section inside result");
1691 }
1692 break;
1693
1694 case Node::Type::tag:
1695 generateResultTag(n, r, out);
1696 break;
1697 }
1698 }
1699 }
1700}
1701
1702} // namespace templates
1703
1704// helper stuff that is only intended to be used internally
1705namespace detail {
1706
1707char const* getEnv(char const* name);
1708bool isEndlessRunning(std::string const& name);
1709bool isWarningsEnabled();
1710
1711template <typename T>
1712T parseFile(std::string const& filename);
1713
1714void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1715void printStabilityInformationOnce(std::ostream* os);
1716
1717// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1718uint64_t& singletonHeaderHash() noexcept;
1719
1720// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1721Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1722
1723// formatting utilities
1724namespace fmt {
1725
1726// adds thousands separator to numbers
1727ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1728class NumSep : public std::numpunct<char> {
1729public:
1730 explicit NumSep(char sep);
1731 char do_thousands_sep() const override;
1732 std::string do_grouping() const override;
1733
1734private:
1735 char mSep;
1736};
1737ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1738
1739// RAII to save & restore a stream's state
1740ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1741class StreamStateRestorer {
1742public:
1743 explicit StreamStateRestorer(std::ostream& s);
1744 ~StreamStateRestorer();
1745
1746 // sets back all stream info that we remembered at construction
1747 void restore();
1748
1749 // don't allow copying / moving
1750 StreamStateRestorer(StreamStateRestorer const&) = delete;
1751 StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
1752 StreamStateRestorer(StreamStateRestorer&&) = delete;
1753 StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
1754
1755private:
1756 std::ostream& mStream;
1757 std::locale mLocale;
1758 std::streamsize const mPrecision;
1759 std::streamsize const mWidth;
1760 std::ostream::char_type const mFill;
1761 std::ostream::fmtflags const mFmtFlags;
1762};
1763ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1764
1765// Number formatter
1766class Number {
1767public:
1768 Number(int width, int precision, double value);
1769 Number(int width, int precision, int64_t value);
1770 std::string to_s() const;
1771
1772private:
1773 friend std::ostream& operator<<(std::ostream& os, Number const& n);
1774 std::ostream& write(std::ostream& os) const;
1775
1776 int mWidth;
1777 int mPrecision;
1778 double mValue;
1779};
1780
1781// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1782std::string to_s(uint64_t s);
1783
1784std::ostream& operator<<(std::ostream& os, Number const& n);
1785
1786class MarkDownColumn {
1787public:
1788 MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
1789 std::string title() const;
1790 std::string separator() const;
1791 std::string invalid() const;
1792 std::string value() const;
1793
1794private:
1795 int mWidth;
1796 int mPrecision;
1797 std::string mTitle;
1798 std::string mSuffix;
1799 double mValue;
1800};
1801
1802// Formats any text as markdown code, escaping backticks.
1803class MarkDownCode {
1804public:
1805 explicit MarkDownCode(std::string const& what);
1806
1807private:
1808 friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1809 std::ostream& write(std::ostream& os) const;
1810
1811 std::string mWhat{};
1812};
1813
1814std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1815
1816} // namespace fmt
1817} // namespace detail
1818} // namespace nanobench
1819} // namespace ankerl
1820
1821// implementation /////////////////////////////////////////////////////////////////////////////////
1822
1823namespace ankerl {
1824namespace nanobench {
1825
1826void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1827 detail::fmt::StreamStateRestorer restorer(out);
1828
1829 out.precision(std::numeric_limits<double>::digits10);
1830 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1831
1832 for (auto const& n : nodes) {
1833 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1834 switch (n.type) {
1835 case templates::Node::Type::content:
1836 out.write(n.begin, std::distance(n.begin, n.end));
1837 break;
1838
1839 case templates::Node::Type::inverted_section:
1840 throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1841
1842 case templates::Node::Type::section:
1843 if (n == "result") {
1844 const size_t nbResults = results.size();
1845 for (size_t i = 0; i < nbResults; ++i) {
1846 generateResult(n.children, i, results, out);
1847 }
1848 } else if (n == "measurement") {
1849 if (results.size() != 1) {
1850 throw std::runtime_error(
1851 "render: can only use section 'measurement' here if there is a single result, but there are " +
1852 detail::fmt::to_s(results.size()));
1853 }
1854 // when we only have a single result, we can immediately go into its measurement.
1855 auto const& r = results.front();
1856 for (size_t i = 0; i < r.size(); ++i) {
1857 generateResultMeasurement(n.children, i, r, out);
1858 }
1859 } else {
1860 throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1861 }
1862 break;
1863
1864 case templates::Node::Type::tag:
1865 if (results.size() == 1) {
1866 // result & config are both supported there
1867 generateResultTag(n, results.front(), out);
1868 } else {
1869 // This just uses the last result's config.
1870 if (!generateConfigTag(n, results.back().config(), out)) {
1871 throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1872 }
1873 }
1874 break;
1875 }
1876 }
1877}
1878
1879void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1880 render(mustacheTemplate.c_str(), results, out);
1881}
1882
1883void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
1884 render(mustacheTemplate, bench.results(), out);
1885}
1886
1887void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
1888 render(mustacheTemplate.c_str(), bench.results(), out);
1889}
1890
1891namespace detail {
1892
1893PerformanceCounters& performanceCounters() {
1894# if defined(__clang__)
1895# pragma clang diagnostic push
1896# pragma clang diagnostic ignored "-Wexit-time-destructors"
1897# endif
1898 static PerformanceCounters pc;
1899# if defined(__clang__)
1900# pragma clang diagnostic pop
1901# endif
1902 return pc;
1903}
1904
1905// Windows version of doNotOptimizeAway
1906// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
1907// see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
1908// see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
1909# if defined(_MSC_VER)
1910# pragma optimize("", off)
1911void doNotOptimizeAwaySink(void const*) {}
1912# pragma optimize("", on)
1913# endif
1914
1915template <typename T>
1916T parseFile(std::string const& filename) {
1917 std::ifstream fin(filename);
1918 T num{};
1919 fin >> num;
1920 return num;
1921}
1922
1923char const* getEnv(char const* name) {
1924# if defined(_MSC_VER)
1925# pragma warning(push)
1926# pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
1927# endif
1928 return std::getenv(name);
1929# if defined(_MSC_VER)
1930# pragma warning(pop)
1931# endif
1932}
1933
1934bool isEndlessRunning(std::string const& name) {
1935 auto endless = getEnv("NANOBENCH_ENDLESS");
1936 return nullptr != endless && endless == name;
1937}
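// Example (illustrative): running `NANOBENCH_ENDLESS="my hash" ./mybench` keeps the
// benchmark named "my hash" iterating indefinitely, which is convenient for attaching
// an external profiler. `./mybench` is a hypothetical binary name.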
1938
1939// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
1940bool isWarningsEnabled() {
1941 auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
1942 return nullptr == suppression || suppression == std::string("0");
1943}
1944
1945void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
1946 warnings.clear();
1947 recommendations.clear();
1948
1949 bool recommendCheckFlags = false;
1950
1951# if defined(DEBUG)
1952 warnings.emplace_back("DEBUG defined");
1953 recommendCheckFlags = true;
1954# endif
1955
1956 bool recommendPyPerf = false;
1957# if defined(__linux__)
1958 auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
1959 if (nprocs <= 0) {
1960 warnings.emplace_back("couldn't figure out the number of processors - no governor / turbo check possible");
1961 } else {
1962
1963 // check frequency scaling
1964 for (long id = 0; id < nprocs; ++id) {
1965 auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
1966 auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
1967 auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
1968 auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
1969 if (minFreq != maxFreq) {
1970 auto minMHz = static_cast<double>(minFreq) / 1000.0;
1971 auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
1972 warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
1973 detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
1974 " MHz");
1975 recommendPyPerf = true;
1976 break;
1977 }
1978 }
1979
1980 auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
1981 if ("performance" != currentGovernor) {
1982 warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
1983 recommendPyPerf = true;
1984 }
1985
1986 if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
1987 warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
1988 recommendPyPerf = true;
1989 }
1990 }
1991# endif
1992
1993 if (recommendCheckFlags) {
1994 recommendations.emplace_back("Make sure you compile for Release");
1995 }
1996 if (recommendPyPerf) {
1997 recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
1998 }
1999}
2000
2001void printStabilityInformationOnce(std::ostream* outStream) {
2002 static bool shouldPrint = true;
2003 if (shouldPrint && outStream && isWarningsEnabled()) {
2004 auto& os = *outStream;
2005 shouldPrint = false;
2006 std::vector<std::string> warnings;
2007 std::vector<std::string> recommendations;
2008 gatherStabilityInformation(warnings, recommendations);
2009 if (warnings.empty()) {
2010 return;
2011 }
2012
2013 os << "Warning, results might be unstable:" << std::endl;
2014 for (auto const& w : warnings) {
2015 os << "* " << w << std::endl;
2016 }
2017
2018 os << std::endl << "Recommendations" << std::endl;
2019 for (auto const& r : recommendations) {
2020 os << "* " << r << std::endl;
2021 }
2022 }
2023}
2024
2025// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
2026uint64_t& singletonHeaderHash() noexcept {
2027 static uint64_t sHeaderHash{};
2028 return sHeaderHash;
2029}
2030
2031ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2032inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2033 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2034}
2035
2036// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2037Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2038 auto bestDuration = Clock::duration::max();
2039 Clock::time_point tBegin;
2040 Clock::time_point tEnd;
2041 for (size_t i = 0; i < numEvaluations; ++i) {
2042 tBegin = Clock::now();
2043 do {
2044 tEnd = Clock::now();
2045 } while (tBegin == tEnd);
2046 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2047 }
2048 return bestDuration;
2049}
2050
2051// Calculates clock resolution once, and remembers the result
2052Clock::duration clockResolution() noexcept {
2053 static Clock::duration sResolution = calcClockResolution(20);
2054 return sResolution;
2055}
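// Worked example (illustrative): if the measured clock resolution is 20 ns and the
// default clockResolutionMultiple() of 1000 is used, the target runtime per epoch
// becomes 20'000 ns = 20 us; it is then clamped between minEpochTime() and
// maxEpochTime() (100 ms by default), see IterationLogic::Impl below.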
2056
2057ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2058struct IterationLogic::Impl {
2059 enum class State { warmup, upscaling_runtime, measuring, endless };
2060
2061 explicit Impl(Bench const& bench)
2062 : mBench(bench)
2063 , mResult(bench.config()) {
2064 printStabilityInformationOnce(mBench.output());
2065
2066 // determine target runtime per epoch
2067 mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2068 if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2069 mTargetRuntimePerEpoch = mBench.maxEpochTime();
2070 }
2071 if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2072 mTargetRuntimePerEpoch = mBench.minEpochTime();
2073 }
2074
2075 if (isEndlessRunning(mBench.name())) {
2076 std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
2077 mNumIters = (std::numeric_limits<uint64_t>::max)();
2078 mState = State::endless;
2079 } else if (0 != mBench.warmup()) {
2080 mNumIters = mBench.warmup();
2081 mState = State::warmup;
2082 } else if (0 != mBench.epochIterations()) {
2083 // exact number of iterations
2084 mNumIters = mBench.epochIterations();
2085 mState = State::measuring;
2086 } else {
2087 mNumIters = mBench.minEpochIterations();
2088 mState = State::upscaling_runtime;
2089 }
2090 }
2091
2092 // directly calculates new iters based on elapsed & iters, and adds up to 20% random noise (10% on average). Makes sure we don't underflow.
2093 ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2094 auto doubleElapsed = d(elapsed);
2095 auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2096 auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2097
2098 auto doubleMinEpochIters = d(mBench.minEpochIterations());
2099 if (doubleNewIters < doubleMinEpochIters) {
2100 doubleNewIters = doubleMinEpochIters;
2101 }
2102 doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2103
2104 // +0.5 for correct rounding when casting
2105 // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2106 return static_cast<uint64_t>(doubleNewIters + 0.5);
2107 }
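    // Worked example (illustrative): if 100 iterations took 1 ms and the target runtime
    // per epoch is 20 ms, doubleNewIters = 20 / 1 * 100 = 2000; after adding up to 20%
    // random noise the next epoch runs roughly 2000 to 2400 iterations.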
2108
2109 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
2110 if (elapsed * 10 < mTargetRuntimePerEpoch) {
2111 // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
2112 if (mNumIters * 10 < mNumIters) {
2113 // overflow :-(
2114 showResult("iterations overflow. Maybe your code got optimized away?");
2115 mNumIters = 0;
2116 return;
2117 }
2118 mNumIters *= 10;
2119 } else {
2120 mNumIters = calcBestNumIters(elapsed, mNumIters);
2121 }
2122 }
2123
2124 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2125# if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2126 auto oldIters = mNumIters;
2127# endif
2128
2129 switch (mState) {
2130 case State::warmup:
2131 if (isCloseEnoughForMeasurements(elapsed)) {
2132 // if elapsed is close enough, we can skip upscaling and go right to measurements
2133 // still, we don't add the result to the measurements.
2134 mState = State::measuring;
2135 mNumIters = calcBestNumIters(elapsed, mNumIters);
2136 } else {
2137 // not close enough: switch to upscaling
2138 mState = State::upscaling_runtime;
2139 upscale(elapsed);
2140 }
2141 break;
2142
2143 case State::upscaling_runtime:
2144 if (isCloseEnoughForMeasurements(elapsed)) {
2145 // if we are close enough, add measurement and switch to always measuring
2146 mState = State::measuring;
2147 mTotalElapsed += elapsed;
2148 mTotalNumIters += mNumIters;
2149 mResult.add(elapsed, mNumIters, pc);
2150 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2151 } else {
2152 upscale(elapsed);
2153 }
2154 break;
2155
2156 case State::measuring:
2157 // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
2158 // that fluctuation, or else we would bias the result
2159 mTotalElapsed += elapsed;
2160 mTotalNumIters += mNumIters;
2161 mResult.add(elapsed, mNumIters, pc);
2162 if (0 != mBench.epochIterations()) {
2163 mNumIters = mBench.epochIterations();
2164 } else {
2165 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2166 }
2167 break;
2168
2169 case State::endless:
2170 mNumIters = (std::numeric_limits<uint64_t>::max)();
2171 break;
2172 }
2173
2174 if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2175 // we got all the results that we need, finish it
2176 showResult("");
2177 mNumIters = 0;
2178 }
2179
2180 ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
2181 << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
2182 << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
2183 << ", mState=" << static_cast<int>(mState));
2184 }
2185
2186 void showResult(std::string const& errorMessage) const {
2187 ANKERL_NANOBENCH_LOG(errorMessage);
2188
2189 if (mBench.output() != nullptr) {
2190 // prepare column data ///////
2191 std::vector<fmt::MarkDownColumn> columns;
2192
2193 auto rMedian = mResult.median(Result::Measure::elapsed);
2194
2195 if (mBench.relative()) {
2196 double d = 100.0;
2197 if (!mBench.results().empty()) {
2198 d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2199 }
2200 columns.emplace_back(11, 1, "relative", "%", d);
2201 }
2202
2203 if (mBench.complexityN() > 0) {
2204 columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2205 }
2206
2207 columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
2208 rMedian / (mBench.timeUnit().count() * mBench.batch()));
2209 columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2210
2211 double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2212 columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2213
2214 double rInsMedian = -1.0;
2215 if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
2216 rInsMedian = mResult.median(Result::Measure::instructions);
2217 columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2218 }
2219
2220 double rCycMedian = -1.0;
2221 if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
2222 rCycMedian = mResult.median(Result::Measure::cpucycles);
2223 columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2224 }
2225 if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2226 columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2227 }
2228 if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
2229 double rBraMedian = mResult.median(Result::Measure::branchinstructions);
2230 columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2231 if (mResult.has(Result::Measure::branchmisses)) {
2232 double p = 0.0;
2233 if (rBraMedian >= 1e-9) {
2234 p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2235 }
2236 columns.emplace_back(10, 1, "miss%", "%", p);
2237 }
2238 }
2239
2240 columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
2241
2242 // write everything
2243 auto& os = *mBench.output();
2244
2245 // combine all elements that are relevant for printing the header
2246 uint64_t hash = 0;
2247 hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2248 hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2249 hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2250 hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2251 hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2252 hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2253
2254 if (hash != singletonHeaderHash()) {
2255 singletonHeaderHash() = hash;
2256
2257 // no result yet, print header
2258 os << std::endl;
2259 for (auto const& col : columns) {
2260 os << col.title();
2261 }
2262 os << "| " << mBench.title() << std::endl;
2263
2264 for (auto const& col : columns) {
2265 os << col.separator();
2266 }
2267 os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2268 }
2269
2270 if (!errorMessage.empty()) {
2271 for (auto const& col : columns) {
2272 os << col.invalid();
2273 }
2274 os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2275 } else {
2276 for (auto const& col : columns) {
2277 os << col.value();
2278 }
2279 os << "| ";
2280 auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2281 if (showUnstable) {
2282 os << ":wavy_dash: ";
2283 }
2284 os << fmt::MarkDownCode(mBench.name());
2285 if (showUnstable) {
2286 auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2287 // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2288 auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2289
2290 os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2291 << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2292 }
2293 os << std::endl;
2294 }
2295 }
2296 }
2297
2298 ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
2299 return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2300 }
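    // Worked example for the 2/3 threshold above (illustrative): with a target runtime of 3ms per epoch,
    // "elapsed * 3 >= target * 2" first holds at elapsed == 2ms, i.e. an epoch counts as close enough once it
    // reaches two thirds of the target. Integer multiplication is used so no division of durations is needed.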
2301
2302 uint64_t mNumIters = 1;
2303 Bench const& mBench;
2304 std::chrono::nanoseconds mTargetRuntimePerEpoch{};
2305 Result mResult;
2306 Rng mRng{123};
2307 std::chrono::nanoseconds mTotalElapsed{};
2308 uint64_t mTotalNumIters = 0;
2309
2310 State mState = State::upscaling_runtime;
2311};
2312ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2313
2314IterationLogic::IterationLogic(Bench const& bench) noexcept
2315 : mPimpl(new Impl(bench)) {}
2316
2317IterationLogic::~IterationLogic() {
2318 if (mPimpl) {
2319 delete mPimpl;
2320 }
2321}
2322
2323uint64_t IterationLogic::numIters() const noexcept {
2324 ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
2325 return mPimpl->mNumIters;
2326}
2327
2328void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2329 mPimpl->add(elapsed, pc);
2330}
2331
2332void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2333 results.emplace_back(std::move(mPimpl->mResult));
2334}
2335
2336# if ANKERL_NANOBENCH(PERF_COUNTERS)
2337
2338ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2339class LinuxPerformanceCounters {
2340public:
2341 struct Target {
2342 Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2343 : targetValue(targetValue_)
2344 , correctMeasuringOverhead(correctMeasuringOverhead_)
2345 , correctLoopOverhead(correctLoopOverhead_) {}
2346
2347 uint64_t* targetValue{};
2348 bool correctMeasuringOverhead{};
2349 bool correctLoopOverhead{};
2350 };
2351
2352 ~LinuxPerformanceCounters();
2353
2354 // quick operation
2355 inline void start() {}
2356
2357 inline void stop() {}
2358
2359 bool monitor(perf_sw_ids swId, Target target);
2360 bool monitor(perf_hw_id hwId, Target target);
2361
2362 bool hasError() const noexcept {
2363 return mHasError;
2364 }
2365
2366    // Just reading the counters is faster than enabling & disabling them.
2367    // We subtract the measuring overhead ourselves.
2368 inline void beginMeasure() {
2369 if (mHasError) {
2370 return;
2371 }
2372
2373 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2374 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2375 if (mHasError) {
2376 return;
2377 }
2378
2379 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2380 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2381 }
2382
2383 inline void endMeasure() {
2384 if (mHasError) {
2385 return;
2386 }
2387
2388 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2389 mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2390 if (mHasError) {
2391 return;
2392 }
2393
2394 auto const numBytes = sizeof(uint64_t) * mCounters.size();
2395 auto ret = read(mFd, mCounters.data(), numBytes);
2396 mHasError = ret != static_cast<ssize_t>(numBytes);
2397 }
2398
2399 void updateResults(uint64_t numIters);
2400
2401 // rounded integer division
2402 template <typename T>
2403 static inline T divRounded(T a, T divisor) {
2404 return (a + divisor / 2) / divisor;
2405 }
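    // Example (illustrative): divRounded(7, 3) == (7 + 1) / 3 == 2 and divRounded(8, 3) == (8 + 1) / 3 == 3,
    // so values are rounded to the nearest integer instead of truncated as with plain integer division.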
2406
2407 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2408 static inline uint32_t mix(uint32_t x) noexcept {
2409 x ^= x << 13;
2410 x ^= x >> 17;
2411 x ^= x << 5;
2412 return x;
2413 }
2414
2415 template <typename Op>
2416 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2417 void calibrate(Op&& op) {
2418        // clear current calibration data
2419 for (auto& v : mCalibratedOverhead) {
2420 v = UINT64_C(0);
2421 }
2422
2423 // create new calibration data
2424 auto newCalibration = mCalibratedOverhead;
2425 for (auto& v : newCalibration) {
2426 v = (std::numeric_limits<uint64_t>::max)();
2427 }
2428 for (size_t iter = 0; iter < 100; ++iter) {
2429 beginMeasure();
2430 op();
2431 endMeasure();
2432 if (mHasError) {
2433 return;
2434 }
2435
2436 for (size_t i = 0; i < newCalibration.size(); ++i) {
2437 auto diff = mCounters[i];
2438 if (newCalibration[i] > diff) {
2439 newCalibration[i] = diff;
2440 }
2441 }
2442 }
2443
2444 mCalibratedOverhead = std::move(newCalibration);
2445
2446 {
2447            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
2448            // Marsaglia's xorshift: mov, sal/shr, xor; three instructions per call.
2449            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
2450            // see https://godbolt.org/z/49RVQ5
2451 uint64_t const numIters = 100000U + (std::random_device{}() & 3);
2452 uint64_t n = numIters;
2453 uint32_t x = 1234567;
2454
2455 beginMeasure();
2456 while (n-- > 0) {
2457 x = mix(x);
2458 }
2459            endMeasure();
2460            detail::doNotOptimizeAway(x);
2461            auto measure1 = mCounters;
2462
2463 n = numIters;
2464 beginMeasure();
2465 while (n-- > 0) {
2466 // we now run *twice* so we can easily calculate the overhead
2467 x = mix(x);
2468 x = mix(x);
2469 }
2470            endMeasure();
2471            detail::doNotOptimizeAway(x);
2472            auto measure2 = mCounters;
2473
2474 for (size_t i = 0; i < mCounters.size(); ++i) {
2475                // factor 2 because the second loop runs mix() twice per iteration
2476 auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2477 auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2478 auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2479
2480 mLoopOverhead[i] = divRounded(overhead, numIters);
2481 }
2482 }
2483 }
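    // Sketch of the loop-overhead arithmetic above (illustrative numbers): with numIters N, the first loop measures
    // roughly N * (loop + mix) events and the second roughly N * (loop + 2 * mix), so
    //     overhead = 2 * m1 - m2 ~= N * loop
    // and mLoopOverhead[i] = divRounded(overhead, N) is the per-iteration loop cost. E.g. for N = 100000,
    // m1 = 500000 and m2 = 800000 instructions give a loop overhead of (1000000 - 800000) / 100000 = 2 per iteration.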
2484
2485private:
2486 bool monitor(uint32_t type, uint64_t eventid, Target target);
2487
2488 std::map<uint64_t, Target> mIdToTarget{};
2489
2490 // start with minimum size of 3 for read_format
2491 std::vector<uint64_t> mCounters{3};
2492 std::vector<uint64_t> mCalibratedOverhead{3};
2493 std::vector<uint64_t> mLoopOverhead{3};
2494
2495 uint64_t mTimeEnabledNanos = 0;
2496 uint64_t mTimeRunningNanos = 0;
2497 int mFd = -1;
2498 bool mHasError = false;
2499};
2500ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2501
2502LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2503 if (-1 != mFd) {
2504 close(mFd);
2505 }
2506}
2507
2508bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2509 return monitor(PERF_TYPE_SOFTWARE, swId, target);
2510}
2511
2512bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2513 return monitor(PERF_TYPE_HARDWARE, hwId, target);
2514}
2515
2516// overflow is ok, it's checked
2517ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2518void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2519 // clear old data
2520 for (auto& id_value : mIdToTarget) {
2521 *id_value.second.targetValue = UINT64_C(0);
2522 }
2523
2524 if (mHasError) {
2525 return;
2526 }
2527
2528 mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2529 mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2530
2531 for (uint64_t i = 0; i < mCounters[0]; ++i) {
2532 auto idx = static_cast<size_t>(3 + i * 2 + 0);
2533 auto id = mCounters[idx + 1U];
2534
2535 auto it = mIdToTarget.find(id);
2536 if (it != mIdToTarget.end()) {
2537
2538 auto& tgt = it->second;
2539 *tgt.targetValue = mCounters[idx];
2540 if (tgt.correctMeasuringOverhead) {
2541 if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2542 *tgt.targetValue -= mCalibratedOverhead[idx];
2543 } else {
2544 *tgt.targetValue = 0U;
2545 }
2546 }
2547 if (tgt.correctLoopOverhead) {
2548 auto correctionVal = mLoopOverhead[idx] * numIters;
2549 if (*tgt.targetValue >= correctionVal) {
2550 *tgt.targetValue -= correctionVal;
2551 } else {
2552 *tgt.targetValue = 0U;
2553 }
2554 }
2555 }
2556 }
2557}
2558
2559bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2560 *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2561 if (mHasError) {
2562 return false;
2563 }
2564
2565 auto pea = perf_event_attr();
2566 std::memset(&pea, 0, sizeof(perf_event_attr));
2567 pea.type = type;
2568 pea.size = sizeof(perf_event_attr);
2569 pea.config = eventid;
2570 pea.disabled = 1; // start counter as disabled
2571 pea.exclude_kernel = 1;
2572 pea.exclude_hv = 1;
2573
2574 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2575 pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2576
2577 const int pid = 0; // the current process
2578 const int cpu = -1; // all CPUs
2579# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2580 const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2581# else
2582 const unsigned long flags = 0;
2583# endif
2584
2585 auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2586 if (-1 == fd) {
2587 return false;
2588 }
2589 if (-1 == mFd) {
2590 // first call: set to fd, and use this from now on
2591 mFd = fd;
2592 }
2593 uint64_t id = 0;
2594 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2595 if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
2596 // couldn't get id
2597 return false;
2598 }
2599
2600    // insert into the map; std::map references remain valid after later insertions.
2601 mIdToTarget.emplace(id, target);
2602
2603    // resize the read_format buffers to the correct size (after the insert)
2604 auto size = 3 + 2 * mIdToTarget.size();
2605 mCounters.resize(size);
2606 mCalibratedOverhead.resize(size);
2607 mLoopOverhead.resize(size);
2608
2609 return true;
2610}
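// Layout sketch of the buffer filled by read() in endMeasure() (per the perf_event_open(2) read_format layout for
// PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING):
//
//     mCounters[0]              number of counters in the group (nr)
//     mCounters[1]              time_enabled in nanoseconds
//     mCounters[2]              time_running in nanoseconds
//     mCounters[3 + 2*i]        value of counter i
//     mCounters[3 + 2*i + 1]    id of counter i, used as the key into mIdToTarget
//
// which is why the vectors are resized to 3 + 2 * mIdToTarget.size() and why updateResults() indexes them that way.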
2611
2612PerformanceCounters::PerformanceCounters()
2613 : mPc(new LinuxPerformanceCounters())
2614 , mVal()
2615 , mHas() {
2616
2617 mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
2618 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2619 mHas.contextSwitches =
2620 mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
2621 mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
2622 mHas.branchInstructions =
2623 mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
2624 mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
2625 // mHas.branchMisses = false;
2626
2627 mPc->start();
2628 mPc->calibrate([] {
2629 auto before = ankerl::nanobench::Clock::now();
2630 auto after = ankerl::nanobench::Clock::now();
2631 (void)before;
2632 (void)after;
2633 });
2634
2635 if (mPc->hasError()) {
2636 // something failed, don't monitor anything.
2637 mHas = PerfCountSet<bool>{};
2638 }
2639}
2640
2641PerformanceCounters::~PerformanceCounters() {
2642 if (nullptr != mPc) {
2643 delete mPc;
2644 }
2645}
2646
2647void PerformanceCounters::beginMeasure() {
2648 mPc->beginMeasure();
2649}
2650
2651void PerformanceCounters::endMeasure() {
2652 mPc->endMeasure();
2653}
2654
2655void PerformanceCounters::updateResults(uint64_t numIters) {
2656 mPc->updateResults(numIters);
2657}
2658
2659# else
2660
2661PerformanceCounters::PerformanceCounters() = default;
2662PerformanceCounters::~PerformanceCounters() = default;
2663void PerformanceCounters::beginMeasure() {}
2664void PerformanceCounters::endMeasure() {}
2665void PerformanceCounters::updateResults(uint64_t) {}
2666
2667# endif
2668
2669ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
2670 return mVal;
2671}
2672ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
2673 return mHas;
2674}
2675
2676// formatting utilities
2677namespace fmt {
2678
2679// adds thousands separator to numbers
2680NumSep::NumSep(char sep)
2681 : mSep(sep) {}
2682
2683char NumSep::do_thousands_sep() const {
2684 return mSep;
2685}
2686
2687std::string NumSep::do_grouping() const {
2688 return "\003";
2689}
2690
2691// RAII to save & restore a stream's state
2692StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2693 : mStream(s)
2694 , mLocale(s.getloc())
2695 , mPrecision(s.precision())
2696 , mWidth(s.width())
2697 , mFill(s.fill())
2698 , mFmtFlags(s.flags()) {}
2699
2700StreamStateRestorer::~StreamStateRestorer() {
2701 restore();
2702}
2703
2704// sets back all stream info that we remembered at construction
2705void StreamStateRestorer::restore() {
2706 mStream.imbue(mLocale);
2707 mStream.precision(mPrecision);
2708 mStream.width(mWidth);
2709 mStream.fill(mFill);
2710 mStream.flags(mFmtFlags);
2711}
2712
2713Number::Number(int width, int precision, int64_t value)
2714 : mWidth(width)
2715 , mPrecision(precision)
2716 , mValue(static_cast<double>(value)) {}
2717
2718Number::Number(int width, int precision, double value)
2719 : mWidth(width)
2720 , mPrecision(precision)
2721 , mValue(value) {}
2722
2723std::ostream& Number::write(std::ostream& os) const {
2724 StreamStateRestorer restorer(os);
2725 os.imbue(std::locale(os.getloc(), new NumSep(',')));
2726 os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2727 return os;
2728}
2729
2730std::string Number::to_s() const {
2731 std::stringstream ss;
2732 write(ss);
2733 return ss.str();
2734}
2735
2736std::string to_s(uint64_t n) {
2737 std::string str;
2738 do {
2739 str += static_cast<char>('0' + static_cast<char>(n % 10));
2740 n /= 10;
2741 } while (n != 0);
2742 std::reverse(str.begin(), str.end());
2743 return str;
2744}
2745
2746std::ostream& operator<<(std::ostream& os, Number const& n) {
2747 return n.write(os);
2748}
2749
2750MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
2751 : mWidth(w)
2752 , mPrecision(prec)
2753 , mTitle(tit)
2754 , mSuffix(suff)
2755 , mValue(val) {}
2756
2757std::string MarkDownColumn::title() const {
2758 std::stringstream ss;
2759 ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
2760 return ss.str();
2761}
2762
2763std::string MarkDownColumn::separator() const {
2764 std::string sep(static_cast<size_t>(mWidth), '-');
2765 sep.front() = '|';
2766 sep.back() = ':';
2767 return sep;
2768}
2769
2770std::string MarkDownColumn::invalid() const {
2771 std::string sep(static_cast<size_t>(mWidth), ' ');
2772 sep.front() = '|';
2773 sep[sep.size() - 2] = '-';
2774 return sep;
2775}
2776
2777std::string MarkDownColumn::value() const {
2778 std::stringstream ss;
2779 auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
2780 ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
2781 return ss.str();
2782}
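// Worked example (illustrative): MarkDownColumn(11, 1, "relative", "%", 205.3) renders as
//     title():     "| relative "
//     separator(): "|---------:"
//     value():     "|   205.3% "
// i.e. every cell is exactly mWidth characters wide, so the columns of the markdown table line up.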
2783
2784// Formats any text as markdown code, escaping backticks.
2785MarkDownCode::MarkDownCode(std::string const& what) {
2786 mWhat.reserve(what.size() + 2);
2787 mWhat.push_back('`');
2788 for (char c : what) {
2789 mWhat.push_back(c);
2790 if ('`' == c) {
2791 mWhat.push_back('`');
2792 }
2793 }
2794 mWhat.push_back('`');
2795}
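// Example (illustrative): MarkDownCode("x = a`b") produces "`x = a``b`", i.e. the text is wrapped in backticks
// and any embedded backtick is doubled in the emitted markdown.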
2796
2797std::ostream& MarkDownCode::write(std::ostream& os) const {
2798 return os << mWhat;
2799}
2800
2801std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
2802 return mdCode.write(os);
2803}
2804} // namespace fmt
2805} // namespace detail
2806
2807// provide implementation here so it's only generated once
2808Config::Config() = default;
2809Config::~Config() = default;
2810Config& Config::operator=(Config const&) = default;
2811Config& Config::operator=(Config&&) = default;
2812Config::Config(Config const&) = default;
2813Config::Config(Config&&) noexcept = default;
2814
2815// provide implementation here so it's only generated once
2816Result::~Result() = default;
2817Result& Result::operator=(Result const&) = default;
2818Result& Result::operator=(Result&&) = default;
2819Result::Result(Result const&) = default;
2820Result::Result(Result&&) noexcept = default;
2821
2822namespace detail {
2823template <typename T>
2824inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
2825 return static_cast<typename std::underlying_type<T>::type>(val);
2826}
2827} // namespace detail
2828
2829// Result returned after a benchmark has finished. Can be used as a baseline for relative().
2830Result::Result(Config const& benchmarkConfig)
2831 : mConfig(benchmarkConfig)
2832 , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2833
2834void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
2835 using detail::d;
2836 using detail::u;
2837
2838 double dIters = d(iters);
2839 mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2840
2841 mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2842 if (pc.has().pageFaults) {
2843 mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2844 }
2845 if (pc.has().cpuCycles) {
2846 mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2847 }
2848 if (pc.has().contextSwitches) {
2849 mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2850 }
2851 if (pc.has().instructions) {
2852 mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2853 }
2854 if (pc.has().branchInstructions) {
2855 double branchInstructions = 0.0;
2856        // correcting branches: remove the one branch per iteration introduced by the while (...) loop.
2857 if (pc.val().branchInstructions > iters + 1U) {
2858 branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2859 }
2860 mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2861
2862 if (pc.has().branchMisses) {
2863 // correcting branch misses
2864 double branchMisses = d(pc.val().branchMisses);
2865 if (branchMisses > branchInstructions) {
2866 // can't have branch misses when there were branches...
2867 branchMisses = branchInstructions;
2868 }
2869
2870 // assuming at least one missed branch for the loop
2871 branchMisses -= 1.0;
2872 if (branchMisses < 1.0) {
2873 branchMisses = 1.0;
2874 }
2875 mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2876 }
2877 }
2878}
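// Worked example for the branch correction above (illustrative numbers): with iters = 100, a raw count of
// 1201 branch instructions is corrected to 1201 - (100 + 1) = 1100, i.e. 11.0 branches per iteration.
// A raw count of 3 branch misses is capped at the corrected branch count, reduced by the one assumed loop
// miss to 2, and stored as 2 / 100 = 0.02 misses per iteration.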
2879
2880Config const& Result::config() const noexcept {
2881 return mConfig;
2882}
2883
2884inline double calcMedian(std::vector<double>& data) {
2885 if (data.empty()) {
2886 return 0.0;
2887 }
2888 std::sort(data.begin(), data.end());
2889
2890 auto midIdx = data.size() / 2U;
2891 if (1U == (data.size() & 1U)) {
2892 return data[midIdx];
2893 }
2894 return (data[midIdx - 1U] + data[midIdx]) / 2U;
2895}
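// Example (illustrative): calcMedian on {3.0, 1.0, 2.0} sorts to {1, 2, 3} and returns the middle element 2.0;
// on {4.0, 1.0, 3.0, 2.0} it sorts to {1, 2, 3, 4} and returns the mean of the two middle elements, 2.5.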
2896
2897double Result::median(Measure m) const {
2898 // create a copy so we can sort
2899 auto data = mNameToMeasurements[detail::u(m)];
2900 return calcMedian(data);
2901}
2902
2903double Result::average(Measure m) const {
2904 using detail::d;
2905 auto const& data = mNameToMeasurements[detail::u(m)];
2906 if (data.empty()) {
2907 return 0.0;
2908 }
2909
2910    // average = sum of all measurements divided by their count
2911 return sum(m) / d(data.size());
2912}
2913
2914double Result::medianAbsolutePercentError(Measure m) const {
2915 // create copy
2916 auto data = mNameToMeasurements[detail::u(m)];
2917
2918 // calculates MdAPE which is the median of percentage error
2919 // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
2920 auto med = calcMedian(data);
2921
2922 // transform the data to absolute error
2923 for (auto& x : data) {
2924 x = (x - med) / x;
2925 if (x < 0) {
2926 x = -x;
2927 }
2928 }
2929 return calcMedian(data);
2930}
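// Sketch of the MdAPE computation above (illustrative numbers): for measurements {10, 11, 13} the median is 11,
// the absolute percentage errors are |10-11|/10 = 0.10, |11-11|/11 = 0.00 and |13-11|/13 ~= 0.154,
// and the median of {0.00, 0.10, 0.154} is 0.10, i.e. an err% column value of 10.0%.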
2931
2932double Result::sum(Measure m) const noexcept {
2933 auto const& data = mNameToMeasurements[detail::u(m)];
2934 return std::accumulate(data.begin(), data.end(), 0.0);
2935}
2936
2937double Result::sumProduct(Measure m1, Measure m2) const noexcept {
2938 auto const& data1 = mNameToMeasurements[detail::u(m1)];
2939 auto const& data2 = mNameToMeasurements[detail::u(m2)];
2940
2941 if (data1.size() != data2.size()) {
2942 return 0.0;
2943 }
2944
2945 double result = 0.0;
2946 for (size_t i = 0, s = data1.size(); i != s; ++i) {
2947 result += data1[i] * data2[i];
2948 }
2949 return result;
2950}
2951
2952bool Result::has(Measure m) const noexcept {
2953 return !mNameToMeasurements[detail::u(m)].empty();
2954}
2955
2956double Result::get(size_t idx, Measure m) const {
2957 auto const& data = mNameToMeasurements[detail::u(m)];
2958 return data.at(idx);
2959}
2960
2961bool Result::empty() const noexcept {
2962 return 0U == size();
2963}
2964
2965size_t Result::size() const noexcept {
2966 auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
2967 return data.size();
2968}
2969
2970double Result::minimum(Measure m) const noexcept {
2971 auto const& data = mNameToMeasurements[detail::u(m)];
2972 if (data.empty()) {
2973 return 0.0;
2974 }
2975
2976    // here it's safe to assume that at least one element is there
2977 return *std::min_element(data.begin(), data.end());
2978}
2979
2980double Result::maximum(Measure m) const noexcept {
2981 auto const& data = mNameToMeasurements[detail::u(m)];
2982 if (data.empty()) {
2983 return 0.0;
2984 }
2985
2986    // here it's safe to assume that at least one element is there
2987 return *std::max_element(data.begin(), data.end());
2988}
2989
2990Result::Measure Result::fromString(std::string const& str) {
2991 if (str == "elapsed") {
2992 return Measure::elapsed;
2993 } else if (str == "iterations") {
2994 return Measure::iterations;
2995 } else if (str == "pagefaults") {
2996 return Measure::pagefaults;
2997 } else if (str == "cpucycles") {
2998 return Measure::cpucycles;
2999 } else if (str == "contextswitches") {
3000 return Measure::contextswitches;
3001 } else if (str == "instructions") {
3002 return Measure::instructions;
3003 } else if (str == "branchinstructions") {
3004 return Measure::branchinstructions;
3005 } else if (str == "branchmisses") {
3006 return Measure::branchmisses;
3007 } else {
3008 // not found, return _size
3009 return Measure::_size;
3010 }
3011}
3012
3013// Configuration of a microbenchmark.
3014Bench::Bench() {
3015 mConfig.mOut = &std::cout;
3016}
3017
3018Bench::Bench(Bench&&) = default;
3019Bench& Bench::operator=(Bench&&) = default;
3020Bench::Bench(Bench const&) = default;
3021Bench& Bench::operator=(Bench const&) = default;
3022Bench::~Bench() noexcept = default;
3023
3024double Bench::batch() const noexcept {
3025 return mConfig.mBatch;
3026}
3027
3028double Bench::complexityN() const noexcept {
3029 return mConfig.mComplexityN;
3030}
3031
3032// Compare results to a baseline (the first benchmark run). 100% means it is exactly as fast as the baseline,
3033// >100% means it is faster than the baseline, <100% means it is slower than the baseline.
3034Bench& Bench::relative(bool isRelativeEnabled) noexcept {
3035 mConfig.mIsRelative = isRelativeEnabled;
3036 return *this;
3037}
3038bool Bench::relative() const noexcept {
3039 return mConfig.mIsRelative;
3040}
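// Minimal usage sketch for relative mode (illustrative; the benchmark bodies are placeholders): the first run()
// becomes the baseline, and each later result is reported as baselineMedian / ownMedian * 100, so values above
// 100% mean "faster than the baseline".
//
//     ankerl::nanobench::Bench bench;
//     bench.title("string search").relative(true);
//     bench.run("baseline", [&] { /* ... */ });
//     bench.run("candidate", [&] { /* ... */ });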
3041
3042Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
3043 mConfig.mShowPerformanceCounters = showPerformanceCounters;
3044 return *this;
3045}
3046bool Bench::performanceCounters() const noexcept {
3047 return mConfig.mShowPerformanceCounters;
3048}
3049
3050// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
3051// If u differs from currently set unit, the stored results will be cleared.
3052// Use singular (byte, not bytes).
3053Bench& Bench::unit(char const* u) {
3054 if (u != mConfig.mUnit) {
3055 mResults.clear();
3056 }
3057 mConfig.mUnit = u;
3058 return *this;
3059}
3060
3061Bench& Bench::unit(std::string const& u) {
3062 return unit(u.c_str());
3063}
3064
3065std::string const& Bench::unit() const noexcept {
3066 return mConfig.mUnit;
3067}
3068
3069Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
3070 mConfig.mTimeUnit = tu;
3071 mConfig.mTimeUnitName = tuName;
3072 return *this;
3073}
3074
3075std::string const& Bench::timeUnitName() const noexcept {
3076 return mConfig.mTimeUnitName;
3077}
3078
3079std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
3080 return mConfig.mTimeUnit;
3081}
3082
3083// If benchmarkTitle differs from currently set title, the stored results will be cleared.
3084Bench& Bench::title(const char* benchmarkTitle) {
3085 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3086 mResults.clear();
3087 }
3088 mConfig.mBenchmarkTitle = benchmarkTitle;
3089 return *this;
3090}
3091Bench& Bench::title(std::string const& benchmarkTitle) {
3092 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3093 mResults.clear();
3094 }
3095 mConfig.mBenchmarkTitle = benchmarkTitle;
3096 return *this;
3097}
3098
3099std::string const& Bench::title() const noexcept {
3100 return mConfig.mBenchmarkTitle;
3101}
3102
3103Bench& Bench::name(const char* benchmarkName) {
3104 mConfig.mBenchmarkName = benchmarkName;
3105 return *this;
3106}
3107
3108Bench& Bench::name(std::string const& benchmarkName) {
3109 mConfig.mBenchmarkName = benchmarkName;
3110 return *this;
3111}
3112
3113std::string const& Bench::name() const noexcept {
3114 return mConfig.mBenchmarkName;
3115}
3116
3117// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
3118Bench& Bench::epochs(size_t numEpochs) noexcept {
3119 mConfig.mNumEpochs = numEpochs;
3120 return *this;
3121}
3122size_t Bench::epochs() const noexcept {
3123 return mConfig.mNumEpochs;
3124}
3125
3126// Desired evaluation time is a multiple of the clock resolution. Default is 1000 times the clock's measurement precision.
3127Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
3128 mConfig.mClockResolutionMultiple = multiple;
3129 return *this;
3130}
3131size_t Bench::clockResolutionMultiple() const noexcept {
3132 return mConfig.mClockResolutionMultiple;
3133}
3134
3135// Sets the maximum time each epoch should take. Default is 100ms.
3136Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3137 mConfig.mMaxEpochTime = t;
3138 return *this;
3139}
3140std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3141 return mConfig.mMaxEpochTime;
3142}
3143
3144// Sets the minimum time each epoch should take.
3145Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3146 mConfig.mMinEpochTime = t;
3147 return *this;
3148}
3149std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3150 return mConfig.mMinEpochTime;
3151}
3152
3153Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3154 mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3155 return *this;
3156}
3157uint64_t Bench::minEpochIterations() const noexcept {
3158 return mConfig.mMinEpochIterations;
3159}
3160
3161Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3162 mConfig.mEpochIterations = numIters;
3163 return *this;
3164}
3165uint64_t Bench::epochIterations() const noexcept {
3166 return mConfig.mEpochIterations;
3167}
3168
3169Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3170 mConfig.mWarmup = numWarmupIters;
3171 return *this;
3172}
3173uint64_t Bench::warmup() const noexcept {
3174 return mConfig.mWarmup;
3175}
3176
3177Bench& Bench::config(Config const& benchmarkConfig) {
3178 mConfig = benchmarkConfig;
3179 return *this;
3180}
3181Config const& Bench::config() const noexcept {
3182 return mConfig;
3183}
3184
3185Bench& Bench::output(std::ostream* outstream) noexcept {
3186 mConfig.mOut = outstream;
3187 return *this;
3188}
3189
3190ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
3191 return mConfig.mOut;
3192}
3193
3194std::vector<Result> const& Bench::results() const noexcept {
3195 return mResults;
3196}
3197
3198Bench& Bench::render(char const* templateContent, std::ostream& os) {
3199 ::ankerl::nanobench::render(templateContent, *this, os);
3200 return *this;
3201}
3202
3203Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
3204 ::ankerl::nanobench::render(templateContent, *this, os);
3205 return *this;
3206}
3207
3208std::vector<BigO> Bench::complexityBigO() const {
3209 std::vector<BigO> bigOs;
3210 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3211 bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3212 return 1.0;
3213 });
3214 bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3215 return n;
3216 });
3217 bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3218 return std::log2(n);
3219 });
3220 bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3221 return n * std::log2(n);
3222 });
3223 bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3224 return n * n;
3225 });
3226 bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3227 return n * n * n;
3228 });
3229 std::sort(bigOs.begin(), bigOs.end());
3230 return bigOs;
3231}
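// Minimal usage sketch for asymptotic complexity estimation (illustrative): run the same benchmark for several
// problem sizes with complexityN() set, then let complexityBigO() fit the candidate curves; after the sort the
// best fit (lowest normalized root mean square error) comes first.
//
//     ankerl::nanobench::Bench bench;
//     for (size_t n : {1000U, 5000U, 10000U, 50000U, 100000U}) {
//         std::vector<uint64_t> v(n, 1);
//         bench.complexityN(n).run("accumulate", [&] {
//             ankerl::nanobench::doNotOptimizeAway(std::accumulate(v.begin(), v.end(), uint64_t(0)));
//         });
//     }
//     std::cout << bench.complexityBigO() << std::endl;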
3232
3233Rng::Rng()
3234 : mX(0)
3235 , mY(0) {
3236 std::random_device rd;
3237 std::uniform_int_distribution<uint64_t> dist;
3238 do {
3239 mX = dist(rd);
3240 mY = dist(rd);
3241 } while (mX == 0 && mY == 0);
3242}
3243
3244ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3245uint64_t splitMix64(uint64_t& state) noexcept {
3246 uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3247 z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3248 z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3249 return z ^ (z >> 31U);
3250}
3251
3252// Seeded as described in the Romu paper (April 2020 update).
3253Rng::Rng(uint64_t seed) noexcept
3254 : mX(splitMix64(seed))
3255 , mY(splitMix64(seed)) {
3256 for (size_t i = 0; i < 10; ++i) {
3257 operator()();
3258 }
3259}
3260
3261// only used internally to copy the RNG.
3262Rng::Rng(uint64_t x, uint64_t y) noexcept
3263 : mX(x)
3264 , mY(y) {}
3265
3266Rng Rng::copy() const noexcept {
3267 return Rng{mX, mY};
3268}
3269
3270Rng::Rng(std::vector<uint64_t> const& data)
3271 : mX(0)
3272 , mY(0) {
3273 if (data.size() != 2) {
3274 throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3275 detail::fmt::to_s(data.size()));
3276 }
3277 mX = data[0];
3278 mY = data[1];
3279}
3280
3281std::vector<uint64_t> Rng::state() const {
3282 std::vector<uint64_t> data(2);
3283 data[0] = mX;
3284 data[1] = mY;
3285 return data;
3286}
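// Usage sketch for saving and restoring the generator state (illustrative): state() returns the two 64-bit
// words of the Romu state, and the std::vector constructor above rebuilds an identical generator from them.
//
//     ankerl::nanobench::Rng rng(12345);
//     std::vector<uint64_t> saved = rng.state();
//     ankerl::nanobench::Rng restored(saved);
//     // rng() and restored() now produce the same sequence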
3287
3288BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3289 BigO::RangeMeasure rangeMeasure;
3290 for (auto const& result : results) {
3291 if (result.config().mComplexityN > 0.0) {
3292 rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3293 }
3294 }
3295 return rangeMeasure;
3296}
3297
3298BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
3299 : mName(bigOName) {
3300
3301 // estimate the constant factor
3302 double sumRangeMeasure = 0.0;
3303 double sumRangeRange = 0.0;
3304
3305 for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3306 sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
3307 sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
3308 }
3309 mConstant = sumRangeMeasure / sumRangeRange;
3310
3311 // calculate root mean square
3312 double err = 0.0;
3313 double sumMeasure = 0.0;
3314 for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3315 auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
3316 err += diff * diff;
3317
3318 sumMeasure += rangeMeasure[i].second;
3319 }
3320
3321 auto n = static_cast<double>(rangeMeasure.size());
3322 auto mean = sumMeasure / n;
3323 mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3324}
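// The fit above is a least-squares fit of t ~= c * f(n) through the origin (illustrative summary):
//     c = sum(f(n_i) * t_i) / sum(f(n_i)^2)
// and the reported error is the root mean square of the residuals, normalized by the mean measurement:
//     rms = sqrt(sum((c * f(n_i) - t_i)^2) / N) / mean(t)
// where rangeMeasure[i].first is already the mapped value f(n_i) and .second is the measured median time t_i.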
3325
3326BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
3327 : BigO(std::string(bigOName), rangeMeasure) {}
3328
3329std::string const& BigO::name() const noexcept {
3330 return mName;
3331}
3332
3333double BigO::constant() const noexcept {
3334 return mConstant;
3335}
3336
3337double BigO::normalizedRootMeanSquare() const noexcept {
3338 return mNormalizedRootMeanSquare;
3339}
3340
3341bool BigO::operator<(BigO const& other) const noexcept {
3342 return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3343}
3344
3345std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
3346 return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
3347}
3348
3349std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
3350 detail::fmt::StreamStateRestorer restorer(os);
3351 os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
3352 for (auto const& bigO : bigOs) {
3353 os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
3354 os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
3355 os << "| " << bigO.name();
3356 os << std::endl;
3357 }
3358 return os;
3359}
3360
3361} // namespace nanobench
3362} // namespace ankerl
3363
3364#endif // ANKERL_NANOBENCH_IMPLEMENT
3365#endif // ANKERL_NANOBENCH_H_INCLUDED