Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 29 additions & 21 deletions src/perf_counters.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "perf_counters.h"

#include <cstring>
#include <memory>
#include <vector>

#if defined HAVE_LIBPFM
Expand All @@ -29,29 +28,37 @@ namespace internal {
#if defined HAVE_LIBPFM

// Reads the counter data of every group leader descriptor in `leaders`
// into values_ and returns the number of counter values collected.
// If the read from any leader fails or comes back too short, the errno and
// its strerror() text are logged and 0 is returned.
size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
// Create a pointer for multiple reads
const size_t bufsize = values_.size() * sizeof(values_[0]);
char* ptr = reinterpret_cast<char*>(values_.data());
size_t size = bufsize;
// See man page of perf_event_open for the exact data format
// We are using PERF_FORMAT_GROUP |
// PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
struct {
uint64_t nr_;
uint64_t time_enabled_;
uint64_t time_running_;
std::array<uint64_t, kMaxCounters> values_;
} buffer{};

// Any valid read will have nr_, time_enabled_, time_running_,
// and at least one value_
const size_t minimum_read_size = 4 * sizeof(uint64_t);

Value* current_counter = values_.data();
for (int lead : leaders) {
auto read_bytes = ::read(lead, ptr, size);
if (read_bytes >= ssize_t(sizeof(uint64_t))) {
// Actual data bytes are all bytes minus initial padding
std::size_t data_bytes =
static_cast<std::size_t>(read_bytes) - sizeof(uint64_t);
// This should be very cheap since it's in hot cache
std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
// Increment our counters
ptr += data_bytes;
size -= data_bytes;
} else {
ssize_t read_bytes = ::read(lead, &buffer, sizeof(buffer));

if (read_bytes < static_cast<ssize_t>(minimum_read_size)) {
int err = errno;
GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
<< " " << ::strerror(err) << "\n";
return 0;
}

for (size_t i = 0; i < buffer.nr_; i++, current_counter++) {
current_counter->Set(buffer.time_enabled_, buffer.time_running_,
buffer.values_[i]);
}
}
return (bufsize - size) / sizeof(uint64_t);
return current_counter - values_.data();
}

const bool PerfCounters::kSupported = true;
Expand Down Expand Up @@ -125,7 +132,7 @@ PerfCounters PerfCounters::Create(
}

// Here "first" means first in the group, i.e. the group leader
const bool is_first = (group_id < 0);
bool is_first = (group_id < 0);

// This struct will be populated by libpfm from the counter string
// and then fed into the syscall perf_event_open
Expand All @@ -149,14 +156,13 @@ PerfCounters PerfCounters::Create(
// case.
attr.disabled = is_first;
attr.inherit = true;
attr.pinned = is_first;
attr.exclude_kernel = true;
attr.exclude_user = false;
attr.exclude_hv = true;

// Read all counters in a group in one read.
attr.read_format = PERF_FORMAT_GROUP; //| PERF_FORMAT_TOTAL_TIME_ENABLED |
// PERF_FORMAT_TOTAL_TIME_RUNNING;
attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;

int id = -1;
while (id < 0) {
Expand All @@ -175,6 +181,8 @@ PerfCounters PerfCounters::Create(
if (group_id >= 0) {
// Create a new group
group_id = -1;
is_first = true;
attr.disabled = is_first;
} else {
// At this point we have already retried to set a new group id and
// failed. We then give up.
Expand Down
72 changes: 51 additions & 21 deletions src/perf_counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,46 +39,77 @@
namespace benchmark {
namespace internal {

// Typically, we can only read a small number of counters. There is also a
// padding preceding counter values, when reading multiple counters with one
// syscall (which is desirable). PerfCounterValues abstracts these details.
// Typically, we only read a small number of counters. There is also a
// specific format when reading multiple counters with one syscall
// (which is desirable). PerfCounterValues abstracts these details.
// The implementation ensures the storage is inlined, and allows 0-based
// indexing into the counter values.
// The object is used in conjunction with a PerfCounters object, by passing it
// to Snapshot(). The Read() method relocates individual reads, discarding
// the initial padding from each group leader in the values buffer such that
// all user accesses through the [] operator are correct.
// to Snapshot(). The Read() method reads in the appropriate fields into
// the values buffer such that all user accesses through the [] operator are
// correct.
// The [] operator estimates the true value of the counter using the
// time_enabled and time_running values (which will be equal if there is no
// multiplexing, i.e. only a single group).
class BENCHMARK_EXPORT PerfCounterValues {
public:
// Records how many counters this object holds. BM_CHECK_LE verifies that
// the requested count does not exceed kMaxCounters.
explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
BM_CHECK_LE(nr_counters_, kMaxCounters);
}

// We are reading correctly now so the values don't need to skip padding
uint64_t operator[](size_t pos) const { return values_[pos]; }
double operator[](size_t pos) const { return values_[pos].GetEstimate(); }

// Subtracts, counter by counter, the readings held in `counter_values` from
// the first nr_counters_ readings of this object.
// Returns a reference to this object, as is conventional for compound
// assignment; returning by value would copy the whole values_ array on
// every call.
PerfCounterValues& operator-=(const PerfCounterValues& counter_values) {
  for (size_t i = 0; i < nr_counters_; ++i) {
    values_[i] -= counter_values.values_[i];
  }
  return *this;
}

// Increased the maximum to 32 only since the buffer
// is std::array<> backed
static constexpr size_t kMaxCounters = 32;

private:
// Represents one reading of a counter together with the timing information
// needed to scale it.
// time_enabled_ == time_running_ if there is no multiplexing,
// i.e. only a single group.
class Value {
 public:
  // Stores a raw reading: how long the counter was enabled, how long it was
  // actually running, and the raw count.
  void Set(uint64_t time_enabled, uint64_t time_running, uint64_t value) {
    time_enabled_ = time_enabled;
    time_running_ = time_running;
    value_ = value;
  }

  // Returns the raw count scaled by time_enabled_ / time_running_.
  // If time_running_ is zero (for example a default-constructed Value, or
  // the difference of two identical snapshots) no scaling is possible, so
  // the raw count is returned instead of dividing by zero.
  double GetEstimate() const {
    if (time_running_ == 0) {
      return static_cast<double>(value_);
    }
    // Convert before multiplying: the product of two uint64_t values can
    // wrap around, whereas the double product cannot.
    return static_cast<double>(value_) * static_cast<double>(time_enabled_) /
           static_cast<double>(time_running_);
  }

  // Subtracts each field of `value` from the matching field of this object
  // and returns a reference to this object.
  Value& operator-=(const Value& value) {
    time_enabled_ -= value.time_enabled_;
    time_running_ -= value.time_running_;
    value_ -= value.value_;

    return *this;
  }

 private:
  // Zero-initialized so that a Value that was never Set() reads as 0.
  uint64_t time_enabled_ = 0;
  uint64_t time_running_ = 0;
  uint64_t value_ = 0;
};

friend class PerfCounters;
// Get the byte buffer in which perf counters can be captured.
// This is used by PerfCounters::Read
std::pair<char*, size_t> get_data_buffer() {
return {reinterpret_cast<char*>(values_.data()),
sizeof(uint64_t) * (kPadding + nr_counters_)};
}

// This reading is complex, and since the goal of this class is to
// abstract away the intricacies of the reading process, this is
// a better place for it.
size_t Read(const std::vector<int>& leaders);

// Move the padding to 2 due to the reading algorithm (1st padding plus a
// current read padding)
static constexpr size_t kPadding = 2;
std::array<uint64_t, kPadding + kMaxCounters> values_;
std::array<Value, kMaxCounters> values_;
const size_t nr_counters_;
};

Expand Down Expand Up @@ -174,10 +205,9 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final {
valid_read_ &= counters_.Snapshot(&end_values_);
ClobberMemory();

end_values_ -= start_values_;
for (size_t i = 0; i < counters_.names().size(); ++i) {
double measurement = static_cast<double>(end_values_[i]) -
static_cast<double>(start_values_[i]);
measurements.push_back({counters_.names()[i], measurement});
measurements.push_back({counters_.names()[i], end_values_[i]});
}

return valid_read_;
Expand Down