diff --git a/src/perf_counters.cc b/src/perf_counters.cc index f47aa7b42..dcb6e7cc8 100644 --- a/src/perf_counters.cc +++ b/src/perf_counters.cc @@ -15,7 +15,6 @@ #include "perf_counters.h" #include -#include #include #if defined HAVE_LIBPFM @@ -29,29 +28,37 @@ namespace internal { #if defined HAVE_LIBPFM size_t PerfCounterValues::Read(const std::vector& leaders) { - // Create a pointer for multiple reads - const size_t bufsize = values_.size() * sizeof(values_[0]); - char* ptr = reinterpret_cast(values_.data()); - size_t size = bufsize; + // See man page of perf_event_open for the exact data format + // We are using PERF_FORMAT_GROUP | + // PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING + struct { + uint64_t nr_; + uint64_t time_enabled_; + uint64_t time_running_; + std::array values_; + } buffer{}; + + // Any valid read will have nr_, time_enabled_, time_running_, + // and at least one value_ + const size_t minimum_read_size = 4 * sizeof(uint64_t); + + Value* current_counter = values_.data(); for (int lead : leaders) { - auto read_bytes = ::read(lead, ptr, size); - if (read_bytes >= ssize_t(sizeof(uint64_t))) { - // Actual data bytes are all bytes minus initial padding - std::size_t data_bytes = - static_cast(read_bytes) - sizeof(uint64_t); - // This should be very cheap since it's in hot cache - std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes); - // Increment our counters - ptr += data_bytes; - size -= data_bytes; - } else { + ssize_t read_bytes = ::read(lead, &buffer, sizeof(buffer)); + + if (read_bytes < static_cast(minimum_read_size)) { int err = errno; GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err << " " << ::strerror(err) << "\n"; return 0; } + + for (size_t i = 0; i < buffer.nr_; i++, current_counter++) { + current_counter->Set(buffer.time_enabled_, buffer.time_running_, + buffer.values_[i]); + } } - return (bufsize - size) / sizeof(uint64_t); + return current_counter - values_.data(); } const bool PerfCounters::kSupported = true; @@ -125,7 +132,7 @@ PerfCounters PerfCounters::Create( } // Here first means first in group, ie the group leader - const bool is_first = (group_id < 0); + bool is_first = (group_id < 0); // This struct will be populated by libpfm from the counter string // and then fed into the syscall perf_event_open @@ -149,14 +156,13 @@ PerfCounters PerfCounters::Create( // case. attr.disabled = is_first; attr.inherit = true; - attr.pinned = is_first; attr.exclude_kernel = true; attr.exclude_user = false; attr.exclude_hv = true; // Read all counters in a group in one read. - attr.read_format = PERF_FORMAT_GROUP; //| PERF_FORMAT_TOTAL_TIME_ENABLED | - // PERF_FORMAT_TOTAL_TIME_RUNNING; + attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; int id = -1; while (id < 0) { @@ -175,6 +181,8 @@ PerfCounters PerfCounters::Create( if (group_id >= 0) { // Create a new group group_id = -1; + is_first = true; + attr.disabled = is_first; } else { // At this point we have already retried to set a new group id and // failed. We then give up. diff --git a/src/perf_counters.h b/src/perf_counters.h index bf5eb6bc3..64044966b 100644 --- a/src/perf_counters.h +++ b/src/perf_counters.h @@ -39,46 +39,77 @@ namespace benchmark { namespace internal { -// Typically, we can only read a small number of counters. There is also a -// padding preceding counter values, when reading multiple counters with one -// syscall (which is desirable). PerfCounterValues abstracts these details. +// Typically, we only read a small number of counters. There is also a +// specific format when reading multiple counters with one syscall +// (which is desirable). PerfCounterValues abstracts these details. // The implementation ensures the storage is inlined, and allows 0-based // indexing into the counter values. // The object is used in conjunction with a PerfCounters object, by passing it -// to Snapshot(). The Read() method relocates individual reads, discarding -// the initial padding from each group leader in the values buffer such that -// all user accesses through the [] operator are correct. +// to Snapshot(). The Read() method reads in the appropriate fields into +// the values buffer such that all user accesses through the [] operator are +// correct. +// The [] operator estimates the true value of the counter using the +// time_enabled and time_running values (which will be equal if there is no +// multiplexing, i.e only a single group). class BENCHMARK_EXPORT PerfCounterValues { public: explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) { BM_CHECK_LE(nr_counters_, kMaxCounters); } - // We are reading correctly now so the values don't need to skip padding - uint64_t operator[](size_t pos) const { return values_[pos]; } + double operator[](size_t pos) const { return values_[pos].GetEstimate(); } + + PerfCounterValues operator-=(const PerfCounterValues& counter_values) { + for (size_t i = 0; i < nr_counters_; i++) { + values_[i] -= counter_values.values_[i]; + } + + return *this; + } // Increased the maximum to 32 only since the buffer // is std::array<> backed static constexpr size_t kMaxCounters = 32; private: + // Represents the value of a counter. + // time_enabled_ = time_running_ if there is no multiplexing, + // i.e only a single group. + class Value { + public: + void Set(uint64_t time_enabled, uint64_t time_running, uint64_t value) { + time_enabled_ = time_enabled; + time_running_ = time_running; + value_ = value; + } + + double GetEstimate() const { + return static_cast(value_ * time_enabled_) / + static_cast(time_running_); + } + + Value operator-=(const Value& value) { + time_enabled_ -= value.time_enabled_; + time_running_ -= value.time_running_; + value_ -= value.value_; + + return *this; + } + + private: + uint64_t time_enabled_; + uint64_t time_running_; + uint64_t value_; + }; + friend class PerfCounters; - // Get the byte buffer in which perf counters can be captured. - // This is used by PerfCounters::Read - std::pair get_data_buffer() { - return {reinterpret_cast(values_.data()), - sizeof(uint64_t) * (kPadding + nr_counters_)}; - } // This reading is complex and as the goal of this class is to // abstract away the intrincacies of the reading process, this is // a better place for it size_t Read(const std::vector& leaders); - // Move the padding to 2 due to the reading algorithm (1st padding plus a - // current read padding) - static constexpr size_t kPadding = 2; - std::array values_; + std::array values_; const size_t nr_counters_; }; @@ -174,10 +205,9 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final { valid_read_ &= counters_.Snapshot(&end_values_); ClobberMemory(); + end_values_ -= start_values_; for (size_t i = 0; i < counters_.names().size(); ++i) { - double measurement = static_cast(end_values_[i]) - - static_cast(start_values_[i]); - measurements.push_back({counters_.names()[i], measurement}); + measurements.push_back({counters_.names()[i], end_values_[i]}); } return valid_read_;