1 //===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "X86Counter.h"
10 
11 #if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
12     defined(LIBPFM_HAS_FIELD_CYCLES)
13 
14 // FIXME: Use appropriate wrappers for poll.h and mman.h
15 // to support Windows and remove this linux-only guard.
16 
17 #include "llvm/Support/Endian.h"
18 #include "llvm/Support/Errc.h"
19 
20 #include <perfmon/perf_event.h>
21 #include <perfmon/pfmlib.h>
22 #include <perfmon/pfmlib_perf_event.h>
23 
24 #include <atomic>
25 #include <chrono>
26 #include <cstddef>
27 #include <cstdint>
28 #include <limits>
29 #include <memory>
30 #include <vector>
31 
32 #include <poll.h>
33 #include <sys/mman.h>
34 #include <unistd.h>
35 
36 namespace llvm {
37 namespace exegesis {
38 
// Number of entries in the LBR.
static constexpr int kLbrEntries = 16;

// Number of pages used for the perf data area.
static constexpr size_t kBufferPages = 8;

// Size in bytes of the perf data area. The page size is only known at run
// time, so this is initialized dynamically rather than being a constexpr.
static const size_t kDataBufferSize =
    kBufferPages * static_cast<size_t>(getpagesize());

// Size in bytes of the whole mapping: the data area plus one leading page
// that is reserved for perf_event_mmap_page.
static const size_t kMappedBufferSize =
    kDataBufferSize + static_cast<size_t>(getpagesize());
47 
// Waits until the LBR perf event behind FileDescriptor has data to read, or
// until a 10 second timeout expires. Returns the result of poll() unchanged;
// callers in this file treat a positive value as "ready", -1 as an error and
// anything else as a timeout.
static int pollLbrPerfEvent(const int FileDescriptor) {
  constexpr nfds_t NumFds = 1;
  constexpr int TimeoutMs = 10000;

  // Zero-initialize so that revents starts out cleared.
  struct pollfd ReadyFd = {};
  ReadyFd.fd = FileDescriptor;
  ReadyFd.events = POLLIN;
  return poll(&ReadyFd, NumFds, TimeoutMs);
}
56 
57 // Copies the data-buffer into Buf, given the pointer to MMapped.
copyDataBuffer(void * MMappedBuffer,char * Buf,uint64_t Tail,size_t DataSize)58 static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
59                            size_t DataSize) {
60   // First page is reserved for perf_event_mmap_page. Data buffer starts on
61   // the next page.
62   char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
63   // The LBR buffer is a cyclic buffer, we copy data to another buffer.
64   uint64_t Offset = Tail % kDataBufferSize;
65   size_t CopySize = kDataBufferSize - Offset;
66   memcpy(Buf, Start + Offset, CopySize);
67   if (CopySize >= DataSize)
68     return;
69 
70   memcpy(Buf + CopySize, Start, Offset);
71   return;
72 }
73 
74 // Parses the given data-buffer for stats and fill the CycleArray.
75 // If data has been extracted successfully, also modifies the code to jump
76 // out the benchmark loop.
parseDataBuffer(const char * DataBuf,size_t DataSize,const void * From,const void * To,llvm::SmallVector<int64_t,4> * CycleArray)77 static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
78                                    const void *From, const void *To,
79                                    llvm::SmallVector<int64_t, 4> *CycleArray) {
80   const char *DataPtr = DataBuf;
81   while (DataPtr < DataBuf + DataSize) {
82     struct perf_event_header Header;
83     memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
84     if (Header.type != PERF_RECORD_SAMPLE) {
85       // Ignores non-sample records.
86       DataPtr += Header.size;
87       continue;
88     }
89     DataPtr += sizeof(Header);
90     uint64_t Count = llvm::support::endian::read64(DataPtr, support::native);
91     DataPtr += sizeof(Count);
92 
93     struct perf_branch_entry Entry;
94     memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
95 
96     // Read the perf_branch_entry array.
97     for (uint64_t i = 0; i < Count; ++i) {
98       const uint64_t BlockStart = From == nullptr
99                                       ? std::numeric_limits<uint64_t>::min()
100                                       : reinterpret_cast<uint64_t>(From);
101       const uint64_t BlockEnd = To == nullptr
102                                     ? std::numeric_limits<uint64_t>::max()
103                                     : reinterpret_cast<uint64_t>(To);
104 
105       if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
106         CycleArray->push_back(Entry.cycles);
107 
108       if (i == Count - 1)
109         // We've reached the last entry.
110         return llvm::Error::success();
111 
112       // Advance to next entry
113       DataPtr += sizeof(Entry);
114       memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
115     }
116   }
117   return llvm::make_error<llvm::StringError>("Unable to parse databuffer.",
118                                              llvm::errc::io_error);
119 }
120 
X86LbrPerfEvent(unsigned SamplingPeriod)121 X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
122   assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
123   EventString = "BR_INST_RETIRED.NEAR_TAKEN";
124   Attr = new perf_event_attr();
125   Attr->size = sizeof(*Attr);
126   Attr->type = PERF_TYPE_RAW;
127   // FIXME This is SKL's encoding. Not sure if it'll change.
128   Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
129   Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
130   // Don't need to specify "USER" because we've already excluded HV and Kernel.
131   Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
132   Attr->sample_period = SamplingPeriod;
133   Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
134   Attr->disabled = 1;
135   Attr->exclude_kernel = 1;
136   Attr->exclude_hv = 1;
137   Attr->read_format = PERF_FORMAT_GROUP;
138 
139   FullQualifiedEventString = EventString;
140 }
141 
X86LbrCounter(pfm::PerfEvent && NewEvent)142 X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
143     : Counter(std::move(NewEvent)) {
144   MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE,
145                        MAP_SHARED, FileDescriptor, 0);
146   if (MMappedBuffer == MAP_FAILED)
147     llvm::errs() << "Failed to mmap buffer.";
148 }
149 
~X86LbrCounter()150 X86LbrCounter::~X86LbrCounter() {
151   if (0 != munmap(MMappedBuffer, kMappedBufferSize))
152     llvm::errs() << "Failed to munmap buffer.";
153 }
154 
start()155 void X86LbrCounter::start() {
156   ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
157 }
158 
checkLbrSupport()159 llvm::Error X86LbrCounter::checkLbrSupport() {
160   // Do a sample read and check if the results contain non-zero values.
161 
162   X86LbrCounter counter(X86LbrPerfEvent(123));
163   counter.start();
164 
165   // Prevent the compiler from unrolling the loop and get rid of all the
166   // branches. We need at least 16 iterations.
167   int Sum = 0;
168   int V = 1;
169 
170   volatile int *P = &V;
171   auto TimeLimit =
172       std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);
173 
174   for (int I = 0;
175        I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
176        ++I) {
177     Sum += *P;
178   }
179 
180   counter.stop();
181   (void)Sum;
182 
183   auto ResultOrError = counter.doReadCounter(nullptr, nullptr);
184   if (ResultOrError)
185     if (!ResultOrError.get().empty())
186       // If there is at least one non-zero entry, then LBR is supported.
187       for (const int64_t &Value : ResultOrError.get())
188         if (Value != 0)
189           return Error::success();
190 
191   return llvm::make_error<llvm::StringError>(
192       "LBR format with cycles is not suppported on the host.",
193       llvm::errc::not_supported);
194 }
195 
196 llvm::Expected<llvm::SmallVector<int64_t, 4>>
readOrError(StringRef FunctionBytes) const197 X86LbrCounter::readOrError(StringRef FunctionBytes) const {
198   // Disable the event before reading
199   ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);
200 
201   // Find the boundary of the function so that we could filter the LBRs
202   // to keep only the relevant records.
203   if (FunctionBytes.empty())
204     return llvm::make_error<llvm::StringError>("Empty function bytes",
205                                                llvm::errc::invalid_argument);
206   const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
207   const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
208                                                   FunctionBytes.size());
209   return doReadCounter(From, To);
210 }
211 
212 llvm::Expected<llvm::SmallVector<int64_t, 4>>
doReadCounter(const void * From,const void * To) const213 X86LbrCounter::doReadCounter(const void *From, const void *To) const {
214   // The max number of time-outs/retries before we give up.
215   static constexpr int kMaxTimeouts = 160;
216 
217   // Parses the LBR buffer and fills CycleArray with the sequence of cycle
218   // counts from the buffer.
219   llvm::SmallVector<int64_t, 4> CycleArray;
220   auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
221   int NumTimeouts = 0;
222   int PollResult = 0;
223 
224   while (PollResult <= 0) {
225     PollResult = pollLbrPerfEvent(FileDescriptor);
226     if (PollResult > 0)
227       break;
228     if (PollResult == -1)
229       return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
230                                                  llvm::errc::io_error);
231     if (NumTimeouts++ >= kMaxTimeouts)
232       return llvm::make_error<llvm::StringError>(
233           "LBR polling still timed out after max number of attempts.",
234           llvm::errc::device_or_resource_busy);
235   }
236 
237   struct perf_event_mmap_page Page;
238   memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));
239 
240   const uint64_t DataTail = Page.data_tail;
241   const uint64_t DataHead = Page.data_head;
242   // We're supposed to use a barrier after reading data_head.
243   std::atomic_thread_fence(std::memory_order_acq_rel);
244   const size_t DataSize = DataHead - DataTail;
245   if (DataSize > kDataBufferSize)
246     return llvm::make_error<llvm::StringError>(
247         "DataSize larger than buffer size.", llvm::errc::invalid_argument);
248 
249   copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
250   llvm::Error error =
251       parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
252   if (!error)
253     return CycleArray;
254   return std::move(error);
255 }
256 
257 } // namespace exegesis
258 } // namespace llvm
259 
260 #endif // defined(__linux__) && defined(HAVE_LIBPFM) &&
261        // defined(LIBPFM_HAS_FIELD_CYCLES)
262