#ifndef ANKERL_NANOBENCH_H_INCLUDED
#define ANKERL_NANOBENCH_H_INCLUDED

#define ANKERL_NANOBENCH_VERSION_MAJOR 4
#define ANKERL_NANOBENCH_VERSION_MINOR 3
#define ANKERL_NANOBENCH_VERSION_PATCH 6

#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()

#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L

#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
#else
#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
#endif

#if defined(__clang__)
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
        _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
#else
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
#endif

#if defined(__GNUC__)
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
#else
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
#endif

#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
#    define ANKERL_NANOBENCH_LOG(x) \
        std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl
#else
#    define ANKERL_NANOBENCH_LOG(x)
#endif

#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
#    include <linux/version.h>
#    if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
#        undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
#        define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
#    endif
#endif

#if defined(__clang__)
#    define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
#else
#    define ANKERL_NANOBENCH_NO_SANITIZE(...)
#endif

#if defined(_MSC_VER)
#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
#else
#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
#endif

// workaround missing "is_trivially_copyable" in g++ < 5.0
#if defined(__GNUC__) && __GNUC__ < 5
#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
#else
#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
#endif

using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
                               std::chrono::steady_clock>::type;
void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);

char const* csv() noexcept;
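// Usage sketch (not part of the original header): render() expands a
// mustache-like template against a finished benchmark and writes it to a
// stream. Assuming a configured `bench` object:
//
//     ankerl::nanobench::Bench bench;
//     bench.title("example").run("noop", [] {});
//     ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);
//
// The same call works with templates::json(), templates::htmlBoxplot(),
// templates::pyperf(), or a hand-written template string.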
#if ANKERL_NANOBENCH(PERF_COUNTERS)
class LinuxPerformanceCounters;
#endif

    std::string mBenchmarkTitle = "benchmark";
    std::string mBenchmarkName = "noname";
    std::string mUnit = "op";
    double mComplexityN = -1.0;
    size_t mNumEpochs = 11;
    size_t mClockResolutionMultiple = static_cast<size_t>(1000);
    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
    std::chrono::nanoseconds mMinEpochTime{};
    uint64_t mMinEpochIterations{1};
    uint64_t mEpochIterations{0};
    uint64_t mWarmup = 0;
    std::ostream* mOut = nullptr;
    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
    std::string mTimeUnitName = "ns";
    bool mShowPerformanceCounters = true;
    bool mIsRelative = false;

    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);

    std::vector<std::vector<double>> mNameToMeasurements{};

    static constexpr uint64_t(min)();
    static constexpr uint64_t(max)();

    Rng& operator=(Rng&&) noexcept = default;
    ~Rng() noexcept = default;

    explicit Rng(uint64_t seed) noexcept;
    Rng(uint64_t x, uint64_t y) noexcept;

    inline uint64_t operator()() noexcept;
    inline uint32_t bounded(uint32_t range) noexcept;

    inline double uniform01() noexcept;

    template <typename Container>
    void shuffle(Container& container) noexcept;

    std::vector<uint64_t> state() const;

    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
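    // Usage sketch (not part of the original header): Rng is a self-contained
    // 64-bit random generator (based on RomuDuoJr) that can also be used like a
    // standard UniformRandomBitGenerator. Assuming default construction:
    //
    //     ankerl::nanobench::Rng rng;
    //     uint64_t r = rng();                // full 64 bit of randomness
    //     uint32_t die = rng.bounded(6) + 1; // uniform value in [1, 6]
    //     double p = rng.uniform01();        // uniform double in [0, 1)
    //     std::vector<int> v{1, 2, 3, 4};
    //     rng.shuffle(v);                    // in-place shuffle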
    template <typename Op>
    Bench& run(char const* benchmarkName, Op&& op);

    template <typename Op>
    Bench& run(std::string const& benchmarkName, Op&& op);

    template <typename Op>
    Bench& run(Op&& op);

    Bench& title(char const* benchmarkTitle);

    template <typename T>
    Bench& batch(T b) noexcept;

    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;

    Bench& clockResolutionMultiple(size_t multiple) noexcept;
    Bench& epochs(size_t numEpochs) noexcept;
    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
    Bench& minEpochIterations(uint64_t numIters) noexcept;
    Bench& epochIterations(uint64_t numIters) noexcept;
    Bench& warmup(uint64_t numWarmupIters) noexcept;
    Bench& relative(bool isRelativeEnabled) noexcept;
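    // Usage sketch (not part of the original header): a minimal benchmark run
    // using the fluent configuration interface declared above:
    //
    //     ankerl::nanobench::Bench bench;
    //     bench.title("string ops")
    //          .unit("append")
    //          .warmup(100)
    //          .minEpochIterations(1000)
    //          .relative(true);
    //     std::string s;
    //     bench.run("push_back", [&] {
    //         s.push_back('x');
    //         ankerl::nanobench::doNotOptimizeAway(s);
    //     });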
    template <typename Arg>
    Bench& doNotOptimizeAway(Arg&& arg);

    template <typename T>
    Bench& complexityN(T b) noexcept;

    template <typename Op>
    BigO complexityBigO(char const* name, Op op) const;

    template <typename Op>
    BigO complexityBigO(std::string const& name, Op op) const;
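    // Usage sketch (not part of the original header): asymptotic complexity can
    // be estimated by running the benchmark for several problem sizes, tagging
    // each run with complexityN(), and fitting the collected medians:
    //
    //     ankerl::nanobench::Bench bench;
    //     for (size_t n = 16; n <= 4096; n *= 2) {
    //         std::set<uint64_t> s;
    //         bench.complexityN(n).run("std::set insert", [&] {
    //             for (size_t i = 0; i < n; ++i) {
    //                 s.insert(i);
    //             }
    //         });
    //     }
    //     std::cout << bench.complexityBigO() << std::endl; // prints the best fits, e.g. O(n log n)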
    std::vector<Result> mResults{};

// Makes sure none of the given arguments are optimized away by the compiler.
template <typename Arg>
void doNotOptimizeAway(Arg&& arg);

namespace detail {

void doNotOptimizeAwaySink(void const*);
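// Usage sketch (not part of the original header): doNotOptimizeAway() keeps a
// computed value observable so the optimizer cannot delete the benchmarked
// work. The empty asm statement below, with an "r,m" (register or memory)
// operand and a "memory" clobber, makes the compiler believe the value is used.
//
//     uint64_t x = 1;
//     bench.run("shift", [&] {
//         x += x << 1;
//         ankerl::nanobench::doNotOptimizeAway(x);
//     });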
// direct variant: the value is passed as a read-only register or memory operand.
template <typename T>
void doNotOptimizeAway(T const& val) {
    asm volatile("" : : "r,m"(val) : "memory");
}

// indirect variant: the value is treated as read and written, so it cannot be elided.
template <typename T>
void doNotOptimizeAway(T& val) {
#    if defined(__clang__)
    asm volatile("" : "+r,m"(val) : : "memory");
#    else
    asm volatile("" : "+m,r"(val) : : "memory");
#    endif
}
#if ANKERL_NANOBENCH(PERF_COUNTERS)
    LinuxPerformanceCounters* mPc = nullptr;
#endif

    template <typename Op>
    static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
        for (auto& rangeMeasure : data) {
            rangeMeasure.first = op(rangeMeasure.first);
        }
        return data;
    }

    template <typename Op>
    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}

    template <typename Op>
    BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}

    double mNormalizedRootMeanSquare{};

std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);

namespace nanobench {

constexpr uint64_t(Rng::max)() {
    return (std::numeric_limits<uint64_t>::max)();
}

uint64_t Rng::operator()() noexcept {
    uint64_t x = mX;
    mX = UINT64_C(15241094284759029579) * mY;
    mY = rotl(mY - x, 27);
    return x;
}

uint32_t Rng::bounded(uint32_t range) noexcept {
    uint64_t r32 = static_cast<uint32_t>(operator()());
    auto multiresult = r32 * range;
    return static_cast<uint32_t>(multiresult >> 32U);
}
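// Note on bounded() (not part of the original header): the 32x32->64 bit
// multiply-shift above maps a uniform 32 bit value into [0, range) without a
// modulo. For example, with range == 6 a random r32 yields
// static_cast<uint32_t>((r32 * 6) >> 32), which is uniform up to a tiny bias
// that is negligible for benchmarking purposes.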
double Rng::uniform01() noexcept {
    auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
    double d;
    std::memcpy(&d, &i, sizeof(double));
    return d - 1.0;
}
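// Note on uniform01() (not part of the original header): setting the exponent
// bits to 0x3ff and filling the 52 mantissa bits with random data produces a
// double uniformly distributed in [1, 2); subtracting 1.0 shifts it into
// [0, 1). std::memcpy is used instead of a union or reinterpret_cast to avoid
// undefined behavior from type punning.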
template <typename Container>
void Rng::shuffle(Container& container) noexcept {
    auto size = static_cast<uint32_t>(container.size());
    for (auto i = size; i > 1U; --i) {
        using std::swap;
        auto p = bounded(i); // number in [0, i)
        swap(container[i - 1], container[p]);
    }
}
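// Note on shuffle() (not part of the original header): this is a Fisher-Yates
// shuffle driven by bounded(). It walks the container from the back and swaps
// each element with a uniformly chosen element at or before it, so every
// permutation is (up to the small bias of bounded()) equally likely.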
constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
    return (x << k) | (x >> (64U - k));
}

template <typename Op>
Bench& Bench::run(Op&& op) {
    // It is important that this method is kept short so the compiler can do better optimizations/inlining of op().
    detail::IterationLogic iterationLogic(*this);
    auto& pc = detail::performanceCounters();

    while (auto n = iterationLogic.numIters()) {
        pc.beginMeasure();
        Clock::time_point before = Clock::now();
        while (n-- > 0) {
            op();
        }
        Clock::time_point after = Clock::now();
        pc.endMeasure();
        pc.updateResults(iterationLogic.numIters());
        iterationLogic.add(after - before, pc);
    }
    iterationLogic.moveResultTo(mResults);
    return *this;
}
template <typename Op>
Bench& Bench::run(char const* benchmarkName, Op&& op) {
    name(benchmarkName);
    return run(std::forward<Op>(op));
}

template <typename Op>
Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
    name(benchmarkName);
    return run(std::forward<Op>(op));
}

template <typename Op>
BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
}

template <typename Op>
BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
}

template <typename T>
Bench& Bench::batch(T b) noexcept {
    mConfig.mBatch = static_cast<double>(b);
    return *this;
}

template <typename T>
Bench& Bench::complexityN(T n) noexcept {
    mConfig.mComplexityN = static_cast<double>(n);
    return *this;
}

template <typename Arg>
Bench& Bench::doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
    return *this;
}

template <typename Arg>
void doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
}

#if defined(_MSC_VER)
template <typename T>
void doNotOptimizeAway(T const& val) {
    doNotOptimizeAwaySink(&val);
}
#endif
#if defined(ANKERL_NANOBENCH_IMPLEMENT)

#    include <algorithm>
#    include <stdexcept>

#    if defined(__linux__)
#        include <unistd.h> // sysconf
#    endif
#    if ANKERL_NANOBENCH(PERF_COUNTERS)
#        include <linux/perf_event.h>
#        include <sys/ioctl.h>
#        include <sys/syscall.h>
#    endif
namespace nanobench {

class StreamStateRestorer;
class MarkDownColumn;

namespace nanobench {

uint64_t splitMix64(uint64_t& state) noexcept;

template <typename T>
inline double d(T t) noexcept {
    return static_cast<double>(t);
}
inline double d(Clock::duration duration) noexcept {
    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
}

inline Clock::duration clockResolution() noexcept;
namespace templates {

char const* csv() noexcept {
    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
{{/result}})DELIM";
}
char const* htmlBoxplot() noexcept {
    return R"DELIM(<html>
<head>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>
<body>
    <div id="myDiv"></div>
    <script>
        var data = [{{#result}}{
            name: '{{name}}',
            y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
        },{{/result}}];
        var title = '{{title}}';
        data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
        var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
    </script>
</body>
</html>)DELIM";
}
char const* pyperf() noexcept {
    return R"DELIM({
    "benchmarks": [{
        "runs": [{
            "values": [
{{#measurement}}                {{elapsed}}{{^-last}},
{{/last}}{{/measurement}}            ]
        }]
    }],
    "metadata": {
        "loops": {{sum(iterations)}},
        "inner_loops": {{batch}},
        "name": "{{title}}",
        "unit": "second"
    },
    "version": "1.0"
})DELIM";
}
char const* json() noexcept {
    return R"DELIM({
 "results": [
{{#result}}  {
   "title": "{{title}}",
   "name": "{{name}}",
   "unit": "{{unit}}",
   "batch": {{batch}},
   "complexityN": {{complexityN}},
   "epochs": {{epochs}},
   "clockResolution": {{clockResolution}},
   "clockResolutionMultiple": {{clockResolutionMultiple}},
   "maxEpochTime": {{maxEpochTime}},
   "minEpochTime": {{minEpochTime}},
   "minEpochIterations": {{minEpochIterations}},
   "epochIterations": {{epochIterations}},
   "warmup": {{warmup}},
   "relative": {{relative}},
   "median(elapsed)": {{median(elapsed)}},
   "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
   "median(instructions)": {{median(instructions)}},
   "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
   "median(cpucycles)": {{median(cpucycles)}},
   "median(contextswitches)": {{median(contextswitches)}},
   "median(pagefaults)": {{median(pagefaults)}},
   "median(branchinstructions)": {{median(branchinstructions)}},
   "median(branchmisses)": {{median(branchmisses)}},
   "totalTime": {{sumProduct(iterations, elapsed)}},
   "measurements": [
{{#measurement}}    {
     "iterations": {{iterations}},
     "elapsed": {{elapsed}},
     "pagefaults": {{pagefaults}},
     "cpucycles": {{cpucycles}},
     "contextswitches": {{contextswitches}},
     "instructions": {{instructions}},
     "branchinstructions": {{branchinstructions}},
     "branchmisses": {{branchmisses}}
    }{{^-last}},{{/-last}}
{{/measurement}}   ]
  }{{^-last}},{{/-last}}
{{/result}} ]
})DELIM";
}
struct Node {
    enum class Type { tag, content, section, inverted_section };

    char const* begin;
    char const* end;
    std::vector<Node> children;
    Type type;

    template <size_t N>
    bool operator==(char const (&str)[N]) const noexcept {
        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
    }
};

static std::vector<Node> parseMustacheTemplate(char const** tpl) {
    std::vector<Node> nodes;
    while (true) {
        auto begin = std::strstr(*tpl, "{{");
        auto end = begin;
        if (begin != nullptr) {
            begin += 2;
            end = std::strstr(begin, "}}");
        }
        if (begin == nullptr || end == nullptr) {
            // nothing found, finish node
            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
            return nodes;
        }

        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});

        // we found a tag
        *tpl = end + 2;
        switch (*begin) {
        case '/':
            // finished the current section
            return nodes;
        case '#':
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
            break;
        case '^':
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
            break;
        default:
            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
            break;
        }
    }
}
static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
    bool matchFirst = n == "-first";
    bool matchLast = n == "-last";
    if (!matchFirst && !matchLast) {
        return false;
    }

    bool doWrite = false;
    if (n.type == Node::Type::section) {
        doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
    } else if (n.type == Node::Type::inverted_section) {
        doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
    }

    if (doWrite) {
        for (auto const& child : n.children) {
            if (child.type == Node::Type::content) {
                out.write(child.begin, std::distance(child.begin, child.end));
            }
        }
    }
    return true;
}
static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
    matchResult.clear();
    auto idxOpen = str.find('(');
    auto idxClose = str.find(')', idxOpen);
    if (idxClose == std::string::npos) {
        return false;
    }

    matchResult.emplace_back(str.substr(0, idxOpen));

    // split arguments by comma, ignoring whitespace
    matchResult.emplace_back(std::string{});
    for (size_t i = idxOpen + 1; i != idxClose; ++i) {
        if (str[i] == ' ' || str[i] == '\t') {
            // skip whitespace
            continue;
        }
        if (str[i] == ',') {
            // got a comma => new string
            matchResult.emplace_back(std::string{});
            continue;
        }
        // no whitespace, no comma: append
        matchResult.back() += str[i];
    }
    return true;
}
static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
    using detail::d;

    if (n == "title") {
        out << config.mBenchmarkTitle;
        return true;
    } else if (n == "name") {
        out << config.mBenchmarkName;
        return true;
    } else if (n == "unit") {
        out << config.mUnit;
        return true;
    } else if (n == "batch") {
        out << config.mBatch;
        return true;
    } else if (n == "complexityN") {
        out << config.mComplexityN;
        return true;
    } else if (n == "epochs") {
        out << config.mNumEpochs;
        return true;
    } else if (n == "clockResolution") {
        out << d(detail::clockResolution());
        return true;
    } else if (n == "clockResolutionMultiple") {
        out << config.mClockResolutionMultiple;
        return true;
    } else if (n == "maxEpochTime") {
        out << d(config.mMaxEpochTime);
        return true;
    } else if (n == "minEpochTime") {
        out << d(config.mMinEpochTime);
        return true;
    } else if (n == "minEpochIterations") {
        out << config.mMinEpochIterations;
        return true;
    } else if (n == "epochIterations") {
        out << config.mEpochIterations;
        return true;
    } else if (n == "warmup") {
        out << config.mWarmup;
        return true;
    } else if (n == "relative") {
        out << config.mIsRelative;
        return true;
    }
    return false;
}
static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
    if (generateConfigTag(n, r.config(), out)) {
        return out;
    }

    // match e.g. "median(elapsed)" or "sumProduct(iterations, elapsed)"
    std::vector<std::string> matchResult;
    if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
        if (matchResult.size() == 2) {
            auto m = Result::fromString(matchResult[1]);
            if (matchResult[0] == "median") {
                return out << r.median(m);
            }
            if (matchResult[0] == "average") {
                return out << r.average(m);
            }
            if (matchResult[0] == "medianAbsolutePercentError") {
                return out << r.medianAbsolutePercentError(m);
            }
            if (matchResult[0] == "sum") {
                return out << r.sum(m);
            }
            if (matchResult[0] == "minimum") {
                return out << r.minimum(m);
            }
            if (matchResult[0] == "maximum") {
                return out << r.maximum(m);
            }
        } else if (matchResult.size() == 3) {
            auto m1 = Result::fromString(matchResult[1]);
            auto m2 = Result::fromString(matchResult[2]);
            if (matchResult[0] == "sumProduct") {
                return out << r.sumProduct(m1, m2);
            }
        }
    }

    throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
}
static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
    for (auto const& n : nodes) {
        if (!generateFirstLast(n, idx, r.size(), out)) {
            switch (n.type) {
            case Node::Type::content:
                out.write(n.begin, std::distance(n.begin, n.end));
                break;

            case Node::Type::inverted_section:
                throw std::runtime_error("got an inverted section inside measurement");

            case Node::Type::section:
                throw std::runtime_error("got a section inside measurement");

            case Node::Type::tag: {
                auto m = Result::fromString(std::string(n.begin, n.end));
                out << r.get(idx, m);
                break;
            }
            }
        }
    }
}
static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
    auto const& r = results[idx];
    for (auto const& n : nodes) {
        if (!generateFirstLast(n, idx, results.size(), out)) {
            switch (n.type) {
            case Node::Type::content:
                out.write(n.begin, std::distance(n.begin, n.end));
                break;

            case Node::Type::inverted_section:
                throw std::runtime_error("got an inverted section inside result");

            case Node::Type::section:
                if (n == "measurement") {
                    for (size_t i = 0; i < r.size(); ++i) {
                        generateResultMeasurement(n.children, i, r, out);
                    }
                } else {
                    throw std::runtime_error("got a section inside result");
                }
                break;

            case Node::Type::tag:
                generateResultTag(n, r, out);
                break;
            }
        }
    }
}
char const* getEnv(char const* name);
bool isEndlessRunning(std::string const& name);
bool isWarningsEnabled();

template <typename T>
T parseFile(std::string const& filename);

void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
void printStabilityInformationOnce(std::ostream* os);

uint64_t& singletonHeaderHash() noexcept;

Clock::duration calcClockResolution(size_t numEvaluations) noexcept;

class NumSep : public std::numpunct<char> {
    explicit NumSep(char sep);
    char do_thousands_sep() const override;
    std::string do_grouping() const override;
class StreamStateRestorer {
    explicit StreamStateRestorer(std::ostream& s);
    ~StreamStateRestorer();

    StreamStateRestorer(StreamStateRestorer const&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
    StreamStateRestorer(StreamStateRestorer&&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;

    std::ostream& mStream;
    std::locale mLocale;
    std::streamsize const mPrecision;
    std::streamsize const mWidth;
    std::ostream::char_type const mFill;
    std::ostream::fmtflags const mFmtFlags;
    Number(int width, int precision, double value);
    Number(int width, int precision, int64_t value);
    std::string to_s() const;

    friend std::ostream& operator<<(std::ostream& os, Number const& n);
    std::ostream& write(std::ostream& os) const;

std::string to_s(uint64_t s);

std::ostream& operator<<(std::ostream& os, Number const& n);

class MarkDownColumn {
    MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
    std::string title() const;
    std::string separator() const;
    std::string invalid() const;
    std::string value() const;

    std::string mSuffix;

    explicit MarkDownCode(std::string const& what);

    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
    std::ostream& write(std::ostream& os) const;

    std::string mWhat{};

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
namespace nanobench {

void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    detail::fmt::StreamStateRestorer restorer(out);

    out.precision(std::numeric_limits<double>::digits10);
    auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);

    for (auto const& n : nodes) {
        switch (n.type) {
        case templates::Node::Type::content:
            out.write(n.begin, std::distance(n.begin, n.end));
            break;

        case templates::Node::Type::inverted_section:
            throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");

        case templates::Node::Type::section:
            if (n == "result") {
                const size_t nbResults = results.size();
                for (size_t i = 0; i < nbResults; ++i) {
                    generateResult(n.children, i, results, out);
                }
            } else if (n == "measurement") {
                if (results.size() != 1) {
                    throw std::runtime_error(
                        "render: can only use section 'measurement' here if there is a single result, but there are " +
                        detail::fmt::to_s(results.size()));
                }
                auto const& r = results.front();
                for (size_t i = 0; i < r.size(); ++i) {
                    generateResultMeasurement(n.children, i, r, out);
                }
            } else {
                throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
            }
            break;

        case templates::Node::Type::tag:
            if (results.size() == 1) {
                // a single result: all config & result tags can be used directly
                generateResultTag(n, results.front(), out);
            } else {
                // multiple results: only config tags are available here
                if (!generateConfigTag(n, results.back().config(), out)) {
                    throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
                }
            }
            break;
        }
    }
}

void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    render(mustacheTemplate.c_str(), results, out);
}

void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate, bench.results(), out);
}

void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate.c_str(), bench.results(), out);
}
PerformanceCounters& performanceCounters() {
#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wexit-time-destructors"
#    endif
    static PerformanceCounters pc;
#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
    return pc;
}

#    if defined(_MSC_VER)
#        pragma optimize("", off)
void doNotOptimizeAwaySink(void const*) {}
#        pragma optimize("", on)
#    endif

template <typename T>
T parseFile(std::string const& filename) {
    std::ifstream fin(filename);
    T num{};
    fin >> num;
    return num;
}

char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996)
#    endif
    return std::getenv(name);
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
}

bool isEndlessRunning(std::string const& name) {
    auto endless = getEnv("NANOBENCH_ENDLESS");
    return nullptr != endless && endless == name;
}

bool isWarningsEnabled() {
    auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
    return nullptr == suppression || suppression == std::string("0");
}
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
    warnings.clear();
    recommendations.clear();

    bool recommendCheckFlags = false;

#    if defined(DEBUG)
    warnings.emplace_back("DEBUG defined");
    recommendCheckFlags = true;
#    endif

    bool recommendPyPerf = false;
#    if defined(__linux__)
    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
    if (nprocs <= 0) {
        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
    } else {
        // check frequency scaling
        for (long id = 0; id < nprocs; ++id) {
            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
            if (minFreq != maxFreq) {
                auto minMHz = static_cast<double>(minFreq) / 1000.0;
                auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
                                      " MHz");
                recommendPyPerf = true;
            }
        }

        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
        if ("performance" != currentGovernor) {
            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
            recommendPyPerf = true;
        }

        if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
            recommendPyPerf = true;
        }
    }
#    endif

    if (recommendCheckFlags) {
        recommendations.emplace_back("Make sure you compile for Release");
    }
    if (recommendPyPerf) {
        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
    }
}
void printStabilityInformationOnce(std::ostream* outStream) {
    static bool shouldPrint = true;
    if (shouldPrint && outStream && isWarningsEnabled()) {
        auto& os = *outStream;
        shouldPrint = false;
        std::vector<std::string> warnings;
        std::vector<std::string> recommendations;
        gatherStabilityInformation(warnings, recommendations);
        if (warnings.empty()) {
            return;
        }

        os << "Warning, results might be unstable:" << std::endl;
        for (auto const& w : warnings) {
            os << "* " << w << std::endl;
        }

        os << std::endl << "Recommendations" << std::endl;
        for (auto const& r : recommendations) {
            os << "* " << r << std::endl;
        }
    }
}
uint64_t& singletonHeaderHash() noexcept {
    static uint64_t sHeaderHash{};
    return sHeaderHash;
}

inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
    return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
}

// determines the clock resolution by measuring repeatedly and keeping the minimum time difference
Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
    auto bestDuration = Clock::duration::max();
    Clock::time_point tBegin;
    Clock::time_point tEnd;
    for (size_t i = 0; i < numEvaluations; ++i) {
        tBegin = Clock::now();
        do {
            tEnd = Clock::now();
        } while (tBegin == tEnd);
        bestDuration = (std::min)(bestDuration, tEnd - tBegin);
    }
    return bestDuration;
}

// calculates the clock resolution once and caches the result
Clock::duration clockResolution() noexcept {
    static Clock::duration sResolution = calcClockResolution(20);
    return sResolution;
}
struct IterationLogic::Impl {
    enum class State { warmup, upscaling_runtime, measuring, endless };

    explicit Impl(Bench const& bench)
        : mBench(bench)
        , mResult(bench.config()) {
        printStabilityInformationOnce(mBench.output());

        // determine the target runtime per epoch
        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
            mTargetRuntimePerEpoch = mBench.maxEpochTime();
        }
        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
            mTargetRuntimePerEpoch = mBench.minEpochTime();
        }

        if (isEndlessRunning(mBench.name())) {
            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            mState = State::endless;
        } else if (0 != mBench.warmup()) {
            mNumIters = mBench.warmup();
            mState = State::warmup;
        } else if (0 != mBench.epochIterations()) {
            // exact number of iterations requested
            mNumIters = mBench.epochIterations();
            mState = State::measuring;
        } else {
            mNumIters = mBench.minEpochIterations();
            mState = State::upscaling_runtime;
        }
    }

    // directly calculates a new iteration count from elapsed time & iters, and adds some noise
    // so epochs don't all run the exact same number of iterations.
    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
        auto doubleElapsed = d(elapsed);
        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);

        auto doubleMinEpochIters = d(mBench.minEpochIterations());
        if (doubleNewIters < doubleMinEpochIters) {
            doubleNewIters = doubleMinEpochIters;
        }
        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();

        // +0.5 for correct rounding when casting
        return static_cast<uint64_t>(doubleNewIters + 0.5);
    }
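    // Worked example (not part of the original header): with a target runtime of
    // 1'000'000 ns per epoch, an elapsed time of 250'000 ns for 1'000 iterations
    // scales to 1'000'000 / 250'000 * 1'000 = 4'000 iterations; the random factor
    // in [1.0, 1.2) then nudges this up to somewhere between 4'000 and 4'800 so
    // that consecutive epochs do not all sample exactly the same iteration count.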
    void upscale(std::chrono::nanoseconds elapsed) {
        if (elapsed * 10 < mTargetRuntimePerEpoch) {
            // far below the target runtime: multiply iterations by 10 (with overflow check)
            if (mNumIters * 10 < mNumIters) {
                // overflow :-(
                showResult("iterations overflow. Maybe your code got optimized away?");
                mNumIters = 0;
                return;
            }
            mNumIters *= 10;
        } else {
            mNumIters = calcBestNumIters(elapsed, mNumIters);
        }
    }

    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
        auto oldIters = mNumIters;
#    endif

        switch (mState) {
        case State::warmup:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // elapsed is already close enough: skip upscaling and go right to measuring
                mState = State::measuring;
                mNumIters = calcBestNumIters(elapsed, mNumIters);
            } else {
                // not close enough: switch to upscaling
                mState = State::upscaling_runtime;
                upscale(elapsed);
            }
            break;

        case State::upscaling_runtime:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // close enough: add the measurement and switch to always measuring
                mState = State::measuring;
                mTotalElapsed += elapsed;
                mTotalNumIters += mNumIters;
                mResult.add(elapsed, mNumIters, pc);
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            } else {
                upscale(elapsed);
            }
            break;

        case State::measuring:
            // just add measurements - no questions asked, even when the runtime is low
            mTotalElapsed += elapsed;
            mTotalNumIters += mNumIters;
            mResult.add(elapsed, mNumIters, pc);
            if (0 != mBench.epochIterations()) {
                mNumIters = mBench.epochIterations();
            } else {
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            }
            break;

        case State::endless:
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            break;
        }

        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
            // we got all the results we need, finish it
            showResult("");
            mNumIters = 0;
        }

        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
                                           << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
                                           << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
                                           << ", mState=" << static_cast<int>(mState));
    }
    void showResult(std::string const& errorMessage) const {
        if (mBench.output() != nullptr) {
            // prepare column data
            std::vector<fmt::MarkDownColumn> columns;

            auto rMedian = mResult.median(Result::Measure::elapsed);

            if (mBench.relative()) {
                double d = 100.0;
                if (!mBench.results().empty()) {
                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
                }
                columns.emplace_back(11, 1, "relative", "%", d);
            }

            if (mBench.complexityN() > 0) {
                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
            }

            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));
            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);

            double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);

            double rInsMedian = -1.0;
            if (mResult.has(Result::Measure::instructions)) {
                rInsMedian = mResult.median(Result::Measure::instructions);
                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
            }

            double rCycMedian = -1.0;
            if (mResult.has(Result::Measure::cpucycles)) {
                rCycMedian = mResult.median(Result::Measure::cpucycles);
                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
            }
            if (rInsMedian > 0.0 && rCycMedian > 0.0) {
                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
            }
            if (mResult.has(Result::Measure::branchinstructions)) {
                double rBraMedian = mResult.median(Result::Measure::branchinstructions);
                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
                if (mResult.has(Result::Measure::branchmisses)) {
                    double p = 0.0;
                    if (rBraMedian >= 1e-9) {
                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
                    }
                    columns.emplace_back(10, 1, "miss%", "%", p);
                }
            }

            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));

            // write everything
            auto& os = *mBench.output();

            // combine all elements that are relevant for printing the header
            uint64_t hash = 0;
            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);

            if (hash != singletonHeaderHash()) {
                singletonHeaderHash() = hash;
                // print the header
                os << std::endl;
                for (auto const& col : columns) {
                    os << col.title();
                }
                os << "| " << mBench.title() << std::endl;

                for (auto const& col : columns) {
                    os << col.separator();
                }
                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
            }

            if (!errorMessage.empty()) {
                for (auto const& col : columns) {
                    os << col.invalid();
                }
                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
            } else {
                for (auto const& col : columns) {
                    os << col.value();
                }
                os << "| ";
                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
                if (showUnstable) {
                    os << ":wavy_dash: ";
                }
                os << fmt::MarkDownCode(mBench.name());
                if (showUnstable) {
                    auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);

                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
                }
                os << std::endl;
            }
        }
    }
    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
    }
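    // Note (not part of the original header): the check above accepts an epoch
    // once elapsed >= 2/3 of the target runtime, written as an integer
    // comparison (elapsed * 3 >= target * 2) to avoid floating point rounding.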
    uint64_t mNumIters = 1;
    Bench const& mBench;
    std::chrono::nanoseconds mTargetRuntimePerEpoch{};
    Result mResult;
    Rng mRng{123};
    std::chrono::nanoseconds mTotalElapsed{};
    uint64_t mTotalNumIters = 0;

    State mState = State::upscaling_runtime;
};
IterationLogic::IterationLogic(Bench const& bench) noexcept
    : mPimpl(new Impl(bench)) {}

IterationLogic::~IterationLogic() {
    delete mPimpl;
}

uint64_t IterationLogic::numIters() const noexcept {
    return mPimpl->mNumIters;
}

void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
    mPimpl->add(elapsed, pc);
}

void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
    results.emplace_back(std::move(mPimpl->mResult));
}
#    if ANKERL_NANOBENCH(PERF_COUNTERS)

class LinuxPerformanceCounters {
public:
    struct Target {
        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
            : targetValue(targetValue_)
            , correctMeasuringOverhead(correctMeasuringOverhead_)
            , correctLoopOverhead(correctLoopOverhead_) {}

        uint64_t* targetValue{};
        bool correctMeasuringOverhead{};
        bool correctLoopOverhead{};
    };

    ~LinuxPerformanceCounters();

    // quick operations
    inline void start() {}
    inline void stop() {}

    bool monitor(perf_sw_ids swId, Target target);
    bool monitor(perf_hw_id hwId, Target target);

    bool hasError() const noexcept {
        return mHasError;
    }

    // Just reading data is faster than enabling & disabling, so we subtract the overhead ourselves.
    inline void beginMeasure() {
        if (mHasError) {
            return;
        }
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
        if (mHasError) {
            return;
        }
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }

    inline void endMeasure() {
        if (mHasError) {
            return;
        }
        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
        if (mHasError) {
            return;
        }
        auto const numBytes = sizeof(uint64_t) * mCounters.size();
        auto ret = read(mFd, mCounters.data(), numBytes);
        mHasError = ret != static_cast<ssize_t>(numBytes);
    }

    void updateResults(uint64_t numIters);

    // rounded integer division
    template <typename T>
    static inline T divRounded(T a, T divisor) {
        return (a + divisor / 2) / divisor;
    }

    static inline uint32_t mix(uint32_t x) noexcept {
        x ^= x << 13U;
        x ^= x >> 17U;
        x ^= x << 5U;
        return x;
    }

    template <typename Op>
    void calibrate(Op&& op) {
        // clear current calibration data
        for (auto& v : mCalibratedOverhead) {
            v = UINT64_C(0);
        }

        // create new calibration data
        auto newCalibration = mCalibratedOverhead;
        for (auto& v : newCalibration) {
            v = (std::numeric_limits<uint64_t>::max)();
        }
        for (size_t iter = 0; iter < 100; ++iter) {
            beginMeasure();
            op();
            endMeasure();
            if (mHasError) {
                return;
            }
            for (size_t i = 0; i < newCalibration.size(); ++i) {
                auto diff = mCounters[i];
                if (newCalibration[i] > diff) {
                    newCalibration[i] = diff;
                }
            }
        }

        mCalibratedOverhead = std::move(newCalibration);

        {
            // calibrate the loop overhead with a cheap xorshift loop that can't be optimized away
            uint64_t const numIters = 100000U + (std::random_device{}() & 3);
            uint64_t n = numIters;
            uint32_t x = 1234567;

            beginMeasure();
            while (n-- > 0) {
                x = mix(x);
            }
            endMeasure();
            detail::doNotOptimizeAway(x);
            auto measure1 = mCounters;

            n = numIters;
            beginMeasure();
            while (n-- > 0) {
                // run the work twice so the loop overhead can be separated out
                x = mix(x);
                x = mix(x);
            }
            endMeasure();
            detail::doNotOptimizeAway(x);
            auto measure2 = mCounters;

            for (size_t i = 0; i < mCounters.size(); ++i) {
                // factor 2 because the second loop does the work twice
                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;

                mLoopOverhead[i] = divRounded(overhead, numIters);
            }
        }
    }

private:
    bool monitor(uint32_t type, uint64_t eventid, Target target);

    std::map<uint64_t, Target> mIdToTarget{};

    // start with minimum size of 3 for read_format
    std::vector<uint64_t> mCounters{3};
    std::vector<uint64_t> mCalibratedOverhead{3};
    std::vector<uint64_t> mLoopOverhead{3};

    uint64_t mTimeEnabledNanos = 0;
    uint64_t mTimeRunningNanos = 0;

    int mFd = -1;
    bool mHasError = false;
};
LinuxPerformanceCounters::~LinuxPerformanceCounters() {
    if (-1 != mFd) {
        close(mFd);
    }
}

bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_SOFTWARE, swId, target);
}

bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_HARDWARE, hwId, target);
}

void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
    // clear old data
    for (auto& id_value : mIdToTarget) {
        *id_value.second.targetValue = UINT64_C(0);
    }

    if (mHasError) {
        return;
    }

    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];

    for (uint64_t i = 0; i < mCounters[0]; ++i) {
        auto idx = static_cast<size_t>(3 + i * 2 + 0);
        auto id = mCounters[idx + 1U];

        auto it = mIdToTarget.find(id);
        if (it != mIdToTarget.end()) {
            auto& tgt = it->second;
            *tgt.targetValue = mCounters[idx];
            if (tgt.correctMeasuringOverhead) {
                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
                    *tgt.targetValue -= mCalibratedOverhead[idx];
                } else {
                    *tgt.targetValue = 0U;
                }
            }
            if (tgt.correctLoopOverhead) {
                auto correctionVal = mLoopOverhead[idx] * numIters;
                if (*tgt.targetValue >= correctionVal) {
                    *tgt.targetValue -= correctionVal;
                } else {
                    *tgt.targetValue = 0U;
                }
            }
        }
    }
}

bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
    *target.targetValue = (std::numeric_limits<uint64_t>::max)();
    if (mHasError) {
        return false;
    }

    auto pea = perf_event_attr();
    std::memset(&pea, 0, sizeof(perf_event_attr));
    pea.type = type;
    pea.size = sizeof(perf_event_attr);
    pea.config = eventid;
    pea.disabled = 1; // start the counter as disabled
    pea.exclude_kernel = 1;
    pea.exclude_hv = 1;

    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
#        if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
#        else
    const unsigned long flags = 0;
#        endif

    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
    if (-1 == fd) {
        return false;
    }
    if (-1 == mFd) {
        // first call: use this fd as the group leader from now on
        mFd = fd;
    }
    uint64_t id = 0;
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
        // couldn't get the event id
        return false;
    }

    // insert into the map; map references stay valid on insertion
    mIdToTarget.emplace(id, target);

    // each event adds a counter value and an id; plus 3 header fields from read_format
    auto size = 3 + 2 * mIdToTarget.size();
    mCounters.resize(size);
    mCalibratedOverhead.resize(size);
    mLoopOverhead.resize(size);

    return true;
}
PerformanceCounters::PerformanceCounters()
    : mPc(new LinuxPerformanceCounters()) {

    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    mHas.contextSwitches =
        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
    mHas.branchInstructions =
        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));

    mPc->start();
    mPc->calibrate([] {
        auto before = ankerl::nanobench::Clock::now();
        auto after = ankerl::nanobench::Clock::now();
        (void)before;
        (void)after;
    });

    if (mPc->hasError()) {
        // something failed, don't monitor anything.
        mHas = PerfCountSet<bool>{};
    }
}

PerformanceCounters::~PerformanceCounters() {
    if (nullptr != mPc) {
        delete mPc;
    }
}

void PerformanceCounters::beginMeasure() {
    mPc->beginMeasure();
}

void PerformanceCounters::endMeasure() {
    mPc->endMeasure();
}

void PerformanceCounters::updateResults(uint64_t numIters) {
    mPc->updateResults(numIters);
}

#    else

PerformanceCounters::PerformanceCounters() = default;
PerformanceCounters::~PerformanceCounters() = default;
void PerformanceCounters::beginMeasure() {}
void PerformanceCounters::endMeasure() {}
void PerformanceCounters::updateResults(uint64_t) {}

#    endif
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
    return mVal;
}
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
    return mHas;
}

NumSep::NumSep(char sep)
    : mSep(sep) {}

char NumSep::do_thousands_sep() const {
    return mSep;
}

std::string NumSep::do_grouping() const {
    return "\003";
}

StreamStateRestorer::StreamStateRestorer(std::ostream& s)
    : mStream(s)
    , mLocale(s.getloc())
    , mPrecision(s.precision())
    , mWidth(s.width())
    , mFill(s.fill())
    , mFmtFlags(s.flags()) {}

StreamStateRestorer::~StreamStateRestorer() {
    restore();
}

// sets back all stream info that was remembered at construction
void StreamStateRestorer::restore() {
    mStream.imbue(mLocale);
    mStream.precision(mPrecision);
    mStream.width(mWidth);
    mStream.fill(mFill);
    mStream.flags(mFmtFlags);
}
Number::Number(int width, int precision, int64_t value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(static_cast<double>(value)) {}

Number::Number(int width, int precision, double value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(value) {}

std::ostream& Number::write(std::ostream& os) const {
    StreamStateRestorer restorer(os);
    os.imbue(std::locale(os.getloc(), new NumSep(',')));
    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
    return os;
}

std::string Number::to_s() const {
    std::stringstream ss;
    write(ss);
    return ss.str();
}

std::string to_s(uint64_t n) {
    std::string str;
    do {
        str += static_cast<char>('0' + static_cast<char>(n % 10));
        n /= 10;
    } while (n != 0);
    std::reverse(str.begin(), str.end());
    return str;
}

std::ostream& operator<<(std::ostream& os, Number const& n) {
    return n.write(os);
}
MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
    : mWidth(w)
    , mPrecision(prec)
    , mTitle(tit)
    , mSuffix(suff)
    , mValue(val) {}

std::string MarkDownColumn::title() const {
    std::stringstream ss;
    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
    return ss.str();
}

std::string MarkDownColumn::separator() const {
    std::string sep(static_cast<size_t>(mWidth), '-');
    sep.front() = '|';
    sep.back() = ':';
    return sep;
}

std::string MarkDownColumn::invalid() const {
    std::string sep(static_cast<size_t>(mWidth), ' ');
    sep.front() = '|';
    sep[sep.size() - 2] = '-';
    return sep;
}

std::string MarkDownColumn::value() const {
    std::stringstream ss;
    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
    return ss.str();
}
MarkDownCode::MarkDownCode(std::string const& what) {
    mWhat.reserve(what.size() + 2);
    mWhat.push_back('`');
    for (char c : what) {
        mWhat.push_back(c);
        if ('`' == c) {
            // escape backticks by doubling them
            mWhat.push_back('`');
        }
    }
    mWhat.push_back('`');
}

std::ostream& MarkDownCode::write(std::ostream& os) const {
    return os << mWhat;
}

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
    return mdCode.write(os);
}

Config::~Config() = default;

Result::~Result() = default;
Result& Result::operator=(Result const&) = default;
Result& Result::operator=(Result&&) = default;
Result::Result(Result const&) = default;
Result::Result(Result&&) noexcept = default;
template <typename T>
inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
    return static_cast<typename std::underlying_type<T>::type>(val);
}

Result::Result(Config const& benchmarkConfig)
    : mConfig(benchmarkConfig)
    , mNameToMeasurements{detail::u(Result::Measure::_size)} {}

void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
    using detail::d;
    using detail::u;

    double dIters = d(iters);
    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);

    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
    if (pc.has().pageFaults) {
        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
    }
    if (pc.has().cpuCycles) {
        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
    }
    if (pc.has().contextSwitches) {
        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
    }
    if (pc.has().instructions) {
        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
    }
    if (pc.has().branchInstructions) {
        double branchInstructions = 0.0;
        // correcting branches: remove the branch introduced by the benchmark loop for each iteration
        if (pc.val().branchInstructions > iters + 1U) {
            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
        }
        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);

        if (pc.has().branchMisses) {
            // correcting branch misses
            double branchMisses = d(pc.val().branchMisses);
            if (branchMisses > branchInstructions) {
                // can't have more branch misses than branches
                branchMisses = branchInstructions;
            }

            // assume at least one missed branch for the loop
            branchMisses -= 1.0;
            if (branchMisses < 1.0) {
                branchMisses = 1.0;
            }
            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
        }
    }
}
Config const& Result::config() const noexcept {
    return mConfig;
}

inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());

    auto midIdx = data.size() / 2U;
    if (1U == (data.size() & 1U)) {
        return data[midIdx];
    }
    return (data[midIdx - 1U] + data[midIdx]) / 2U;
}
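// Worked example (not part of the original header): for the sorted samples
// {1, 2, 4, 8} the size is even, so the median is (2 + 4) / 2 = 3; for
// {1, 2, 4, 8, 16} the size is odd and the median is the middle element, 4.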
double Result::median(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];
    return calcMedian(data);
}

double Result::average(Measure m) const {
    using detail::d;
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }
    return sum(m) / d(data.size());
}

double Result::medianAbsolutePercentError(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];

    // calculates MdAPE, the median of the absolute percentage errors
    auto med = calcMedian(data);
    for (auto& x : data) {
        x = (x - med) / x;
        if (x < 0) {
            x = -x;
        }
    }
    return calcMedian(data);
}

double Result::sum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return std::accumulate(data.begin(), data.end(), 0.0);
}

double Result::sumProduct(Measure m1, Measure m2) const noexcept {
    auto const& data1 = mNameToMeasurements[detail::u(m1)];
    auto const& data2 = mNameToMeasurements[detail::u(m2)];

    if (data1.size() != data2.size()) {
        return 0.0;
    }

    double result = 0.0;
    for (size_t i = 0, s = data1.size(); i != s; ++i) {
        result += data1[i] * data2[i];
    }
    return result;
}

bool Result::has(Measure m) const noexcept {
    return !mNameToMeasurements[detail::u(m)].empty();
}

double Result::get(size_t idx, Measure m) const {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return data.at(idx);
}

bool Result::empty() const noexcept {
    return 0U == size();
}

size_t Result::size() const noexcept {
    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
    return data.size();
}

double Result::minimum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }
    return *std::min_element(data.begin(), data.end());
}

double Result::maximum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }
    return *std::max_element(data.begin(), data.end());
}
Result::Measure Result::fromString(std::string const& str) {
    if (str == "elapsed") {
        return Measure::elapsed;
    } else if (str == "iterations") {
        return Measure::iterations;
    } else if (str == "pagefaults") {
        return Measure::pagefaults;
    } else if (str == "cpucycles") {
        return Measure::cpucycles;
    } else if (str == "contextswitches") {
        return Measure::contextswitches;
    } else if (str == "instructions") {
        return Measure::instructions;
    } else if (str == "branchinstructions") {
        return Measure::branchinstructions;
    } else if (str == "branchmisses") {
        return Measure::branchmisses;
    }
    // not found
    return Measure::_size;
}
Bench::Bench() {
    mConfig.mOut = &std::cout;
}

Bench::Bench(Bench&&) = default;
Bench& Bench::operator=(Bench&&) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;

double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}

bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}

Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}

bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}

Bench& Bench::unit(char const* u) {
    if (u != mConfig.mUnit) {
        mResults.clear();
    }
    mConfig.mUnit = u;
    return *this;
}

Bench& Bench::unit(std::string const& u) {
    return unit(u.c_str());
}

std::string const& Bench::unit() const noexcept {
    return mConfig.mUnit;
}

Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
    mConfig.mTimeUnit = tu;
    mConfig.mTimeUnitName = tuName;
    return *this;
}

std::string const& Bench::timeUnitName() const noexcept {
    return mConfig.mTimeUnitName;
}

std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
    return mConfig.mTimeUnit;
}

Bench& Bench::title(const char* benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}

Bench& Bench::title(std::string const& benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}

std::string const& Bench::title() const noexcept {
    return mConfig.mBenchmarkTitle;
}

Bench& Bench::name(char const* benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

Bench& Bench::name(std::string const& benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

std::string const& Bench::name() const noexcept {
    return mConfig.mBenchmarkName;
}

Bench& Bench::epochs(size_t numEpochs) noexcept {
    mConfig.mNumEpochs = numEpochs;
    return *this;
}

size_t Bench::epochs() const noexcept {
    return mConfig.mNumEpochs;
}

Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
    mConfig.mClockResolutionMultiple = multiple;
    return *this;
}

size_t Bench::clockResolutionMultiple() const noexcept {
    return mConfig.mClockResolutionMultiple;
}

Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMaxEpochTime = t;
    return *this;
}

std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
    return mConfig.mMaxEpochTime;
}

Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMinEpochTime = t;
    return *this;
}

std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
    return mConfig.mMinEpochTime;
}

Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
    return *this;
}

uint64_t Bench::minEpochIterations() const noexcept {
    return mConfig.mMinEpochIterations;
}

Bench& Bench::epochIterations(uint64_t numIters) noexcept {
    mConfig.mEpochIterations = numIters;
    return *this;
}

uint64_t Bench::epochIterations() const noexcept {
    return mConfig.mEpochIterations;
}

Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
    mConfig.mWarmup = numWarmupIters;
    return *this;
}

uint64_t Bench::warmup() const noexcept {
    return mConfig.mWarmup;
}

Bench& Bench::config(Config const& benchmarkConfig) {
    mConfig = benchmarkConfig;
    return *this;
}

Config const& Bench::config() const noexcept {
    return mConfig;
}

Bench& Bench::output(std::ostream* outstream) noexcept {
    mConfig.mOut = outstream;
    return *this;
}

std::ostream* Bench::output() const noexcept {
    return mConfig.mOut;
}

std::vector<Result> const& Bench::results() const noexcept {
    return mResults;
}

Bench& Bench::render(char const* templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}

Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}
std::vector<BigO> Bench::complexityBigO() const {
    std::vector<BigO> bigOs;
    auto rangeMeasure = BigO::collectRangeMeasure(mResults);
    bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
        return 1.0;
    });
    bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
        return n;
    });
    bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
        return std::log2(n);
    });
    bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
        return n * std::log2(n);
    });
    bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
        return n * n;
    });
    bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
        return n * n * n;
    });
    std::sort(bigOs.begin(), bigOs.end());
    return bigOs;
}

Rng::Rng()
    : mX(0)
    , mY(0) {
    std::random_device rd;
    std::uniform_int_distribution<uint64_t> dist;
    do {
        mX = dist(rd);
        mY = dist(rd);
    } while (mX == 0 && mY == 0);
}
uint64_t splitMix64(uint64_t& state) noexcept {
    uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
    z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
    z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
    return z ^ (z >> 31U);
}

Rng::Rng(uint64_t seed) noexcept
    : mX(splitMix64(seed))
    , mY(splitMix64(seed)) {
    for (size_t i = 0; i < 10; ++i) {
        operator()();
    }
}

Rng::Rng(uint64_t x, uint64_t y) noexcept
    : mX(x)
    , mY(y) {}

Rng Rng::copy() const noexcept {
    return Rng{mX, mY};
}

Rng::Rng(std::vector<uint64_t> const& data)
    : mX(0)
    , mY(0) {
    if (data.size() != 2) {
        throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
                                 detail::fmt::to_s(data.size()));
    }
    mX = data[0];
    mY = data[1];
}

std::vector<uint64_t> Rng::state() const {
    std::vector<uint64_t> data(2);
    data[0] = mX;
    data[1] = mY;
    return data;
}
BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
    BigO::RangeMeasure rangeMeasure;
    for (auto const& result : results) {
        if (result.config().mComplexityN > 0.0) {
            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
        }
    }
    return rangeMeasure;
}

BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
    : mName(bigOName) {

    // estimate the constant factor
    double sumRangeMeasure = 0.0;
    double sumRangeRange = 0.0;

    for (size_t i = 0; i < rangeMeasure.size(); ++i) {
        sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
        sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
    }
    mConstant = sumRangeMeasure / sumRangeRange;

    // calculate the normalized root mean square error
    double err = 0.0;
    double sumMeasure = 0.0;
    for (size_t i = 0; i < rangeMeasure.size(); ++i) {
        auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
        err += diff * diff;
        sumMeasure += rangeMeasure[i].second;
    }

    auto n = static_cast<double>(rangeMeasure.size());
    auto mean = sumMeasure / n;
    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
}
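// Note (not part of the original header): the constant above is the
// least-squares fit of measure ~= constant * range through the origin, i.e.
// constant = sum(range_i * measure_i) / sum(range_i^2). The reported error is
// the root mean square of the residuals, normalized by the mean measurement so
// that differently scaled complexity candidates can be compared directly.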
BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
    : BigO(std::string(bigOName), rangeMeasure) {}

std::string const& BigO::name() const noexcept {
    return mName;
}

double BigO::constant() const noexcept {
    return mConstant;
}

double BigO::normalizedRootMeanSquare() const noexcept {
    return mNormalizedRootMeanSquare;
}

bool BigO::operator<(BigO const& other) const noexcept {
    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
}

std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
}

std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
    detail::fmt::StreamStateRestorer restorer(os);
    os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
    for (auto const& bigO : bigOs) {
        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
        os << "| " << bigO.name();
        os << std::endl;
    }
    return os;
}