//===-- PerfContextSwitchDecoder.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PerfContextSwitchDecoder.h"
#include <optional>

using namespace lldb;
using namespace lldb_private;
using namespace lldb_private::trace_intel_pt;
using namespace llvm;

/// Copied from <linux/perf_event.h> to avoid depending on perf_event.h on
/// non-linux platforms.
/// \{
#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13)

#define PERF_RECORD_LOST 2
#define PERF_RECORD_THROTTLE 5
#define PERF_RECORD_UNTHROTTLE 6
#define PERF_RECORD_LOST_SAMPLES 13
#define PERF_RECORD_SWITCH_CPU_WIDE 15
#define PERF_RECORD_MAX 19

struct perf_event_header {
  uint32_t type;
  uint16_t misc;
  uint16_t size;

  /// \return
  ///   An \a llvm::Error if the record looks obviously wrong, or \a
  ///   llvm::Error::success() otherwise.
  Error SanityCheck() const {
    // The following checks are based on visual inspection of the records and
    // enums in
    // https://elixir.bootlin.com/linux/v4.8/source/include/uapi/linux/perf_event.h
    // See PERF_RECORD_MAX, PERF_RECORD_SWITCH and the data that similar
    // records hold.

    // A record bigger than a few thousand bytes almost certainly means that
    // the data is wrong.
    const uint64_t max_valid_size_bytes = 8000;
    if (size == 0 || size > max_valid_size_bytes)
      return createStringError(
          inconvertibleErrorCode(),
          formatv("A record of {0} bytes was found.", size));

    // We add some numbers to PERF_RECORD_MAX because some systems might have
    // custom records. In any case, we are looking only for abnormal data.
    if (type >= PERF_RECORD_MAX + 100)
      return createStringError(
          inconvertibleErrorCode(),
          formatv("Invalid record type {0} was found.", type));
    return Error::success();
  }

  bool IsContextSwitchRecord() const {
    return type == PERF_RECORD_SWITCH_CPU_WIDE;
  }

  bool IsErrorRecord() const {
    return type == PERF_RECORD_LOST || type == PERF_RECORD_THROTTLE ||
           type == PERF_RECORD_UNTHROTTLE || type == PERF_RECORD_LOST_SAMPLES;
  }
};
/// \}

/// Record found in the perf_event context switch traces. It might contain
/// additional fields in memory, but header.size should have the actual size
/// of the record.
struct PerfContextSwitchRecord {
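  // Layout note (an assumption based on the perf_event ABI): this matches a
  // PERF_RECORD_SWITCH_CPU_WIDE record whose body carries the pid/tid of the
  // other thread involved in the switch, followed by the pid, tid and
  // timestamp that perf appends as sample fields when the event is opened
  // with PERF_SAMPLE_TID and PERF_SAMPLE_TIME.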
  struct perf_event_header header;
  uint32_t next_prev_pid;
  uint32_t next_prev_tid;
  uint32_t pid, tid;
  uint64_t time_in_nanos;

  bool IsOut() const { return header.misc & PERF_RECORD_MISC_SWITCH_OUT; }
};

/// Record produced after parsing the raw context switch trace produced by
/// perf_event. A major difference between this struct and
/// PerfContextSwitchRecord is that this one uses tsc instead of nanos.
struct ContextSwitchRecord {
  uint64_t tsc;
  /// Whether the switch is in or out.
  bool is_out;
  /// pid = 0 and tid = 0 indicate the swapper or idle process, which normally
  /// runs after a context switch out of a normal user thread.
  lldb::pid_t pid;
  lldb::tid_t tid;

  bool IsOut() const { return is_out; }

  bool IsIn() const { return !is_out; }
};

uint64_t ThreadContinuousExecution::GetLowestKnownTSC() const {
  switch (variant) {
  case Variant::Complete:
    return tscs.complete.start;
  case Variant::OnlyStart:
    return tscs.only_start.start;
  case Variant::OnlyEnd:
    return tscs.only_end.end;
  case Variant::HintedEnd:
    return tscs.hinted_end.start;
  case Variant::HintedStart:
    return tscs.hinted_start.end;
  }
}

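// When a boundary wasn't actually observed, GetStartTSC and GetEndTSC below
// fall back to the widest possible values (0 and the maximum uint64_t),
// presumably so that callers treating an execution as a [start, end] range
// never exclude activity that might belong to it.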
uint64_t ThreadContinuousExecution::GetStartTSC() const {
  switch (variant) {
  case Variant::Complete:
    return tscs.complete.start;
  case Variant::OnlyStart:
    return tscs.only_start.start;
  case Variant::OnlyEnd:
    return 0;
  case Variant::HintedEnd:
    return tscs.hinted_end.start;
  case Variant::HintedStart:
    return tscs.hinted_start.hinted_start;
  }
}

uint64_t ThreadContinuousExecution::GetEndTSC() const {
  switch (variant) {
  case Variant::Complete:
    return tscs.complete.end;
  case Variant::OnlyStart:
    return std::numeric_limits<uint64_t>::max();
  case Variant::OnlyEnd:
    return tscs.only_end.end;
  case Variant::HintedEnd:
    return tscs.hinted_end.hinted_end;
  case Variant::HintedStart:
    return tscs.hinted_start.end;
  }
}

ThreadContinuousExecution ThreadContinuousExecution::CreateCompleteExecution(
    lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start,
    uint64_t end) {
  ThreadContinuousExecution o(cpu_id, tid, pid);
  o.variant = Variant::Complete;
  o.tscs.complete.start = start;
  o.tscs.complete.end = end;
  return o;
}

ThreadContinuousExecution ThreadContinuousExecution::CreateHintedStartExecution(
    lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid,
    uint64_t hinted_start, uint64_t end) {
  ThreadContinuousExecution o(cpu_id, tid, pid);
  o.variant = Variant::HintedStart;
  o.tscs.hinted_start.hinted_start = hinted_start;
  o.tscs.hinted_start.end = end;
  return o;
}

ThreadContinuousExecution ThreadContinuousExecution::CreateHintedEndExecution(
    lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start,
    uint64_t hinted_end) {
  ThreadContinuousExecution o(cpu_id, tid, pid);
  o.variant = Variant::HintedEnd;
  o.tscs.hinted_end.start = start;
  o.tscs.hinted_end.hinted_end = hinted_end;
  return o;
}

ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyEndExecution(
    lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t end) {
  ThreadContinuousExecution o(cpu_id, tid, pid);
  o.variant = Variant::OnlyEnd;
  o.tscs.only_end.end = end;
  return o;
}

ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyStartExecution(
    lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start) {
  ThreadContinuousExecution o(cpu_id, tid, pid);
  o.variant = Variant::OnlyStart;
  o.tscs.only_start.start = start;
  return o;
}

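/// Given the current context switch record and the previous one seen on the
/// same cpu (if any), reconstruct the thread execution(s) that the pair
/// implies and report each one through \p on_new_execution. Missing
/// boundaries are emitted as hinted executions.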
static Error RecoverExecutionsFromConsecutiveRecords(
    cpu_id_t cpu_id, const LinuxPerfZeroTscConversion &tsc_conversion,
    const ContextSwitchRecord &current_record,
    const std::optional<ContextSwitchRecord> &prev_record,
    std::function<void(const ThreadContinuousExecution &execution)>
        on_new_execution) {
  if (!prev_record) {
    if (current_record.IsOut()) {
      on_new_execution(ThreadContinuousExecution::CreateOnlyEndExecution(
          cpu_id, current_record.tid, current_record.pid, current_record.tsc));
    }
    // The 'in' case will be handled later when we try to look for its end.
    return Error::success();
  }

  const ContextSwitchRecord &prev = *prev_record;
  if (prev.tsc >= current_record.tsc)
    return createStringError(
        inconvertibleErrorCode(),
        formatv("A context switch record doesn't happen after the previous "
                "record. Previous TSC = {0}, current TSC = {1}.",
                prev.tsc, current_record.tsc));

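  // Note on the hinted boundaries below: when a start or end wasn't captured,
  // the hint is nudged by one TSC tick (prev.tsc + 1 or current_record.tsc - 1),
  // presumably so that the reconstructed executions on this cpu never overlap
  // with the neighboring record's timestamp.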
  if (current_record.IsIn() && prev.IsIn()) {
    // We found two consecutive ins, which means that we didn't capture
    // the end of the previous execution.
    on_new_execution(ThreadContinuousExecution::CreateHintedEndExecution(
        cpu_id, prev.tid, prev.pid, prev.tsc, current_record.tsc - 1));
  } else if (current_record.IsOut() && prev.IsOut()) {
    // We found two consecutive outs, which means that we didn't capture
    // the beginning of the current execution.
    on_new_execution(ThreadContinuousExecution::CreateHintedStartExecution(
        cpu_id, current_record.tid, current_record.pid, prev.tsc + 1,
        current_record.tsc));
  } else if (current_record.IsOut() && prev.IsIn()) {
    if (current_record.pid == prev.pid && current_record.tid == prev.tid) {
      // A complete execution.
      on_new_execution(ThreadContinuousExecution::CreateCompleteExecution(
          cpu_id, current_record.tid, current_record.pid, prev.tsc,
          current_record.tsc));
    } else {
      // An out after the in of a different thread. The first one doesn't
      // have an end, and the second one doesn't have a start.
      on_new_execution(ThreadContinuousExecution::CreateHintedEndExecution(
          cpu_id, prev.tid, prev.pid, prev.tsc, current_record.tsc - 1));
      on_new_execution(ThreadContinuousExecution::CreateHintedStartExecution(
          cpu_id, current_record.tid, current_record.pid, prev.tsc + 1,
          current_record.tsc));
    }
  }
  return Error::success();
}

Expected<std::vector<ThreadContinuousExecution>>
lldb_private::trace_intel_pt::DecodePerfContextSwitchTrace(
    ArrayRef<uint8_t> data, cpu_id_t cpu_id,
    const LinuxPerfZeroTscConversion &tsc_conversion) {

  std::vector<ThreadContinuousExecution> executions;

  // This offset is used to create the error message in case of failures.
  size_t offset = 0;

  auto do_decode = [&]() -> Error {
    std::optional<ContextSwitchRecord> prev_record;
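    // Walk the buffer record by record: each record starts with a
    // perf_event_header whose `size` field holds the total length of that
    // record, so advancing by `size` lands on the next header.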
    while (offset < data.size()) {
      const perf_event_header &perf_record =
          *reinterpret_cast<const perf_event_header *>(data.data() + offset);
      if (Error err = perf_record.SanityCheck())
        return err;

      if (perf_record.IsContextSwitchRecord()) {
        const PerfContextSwitchRecord &context_switch_record =
            *reinterpret_cast<const PerfContextSwitchRecord *>(data.data() +
                                                               offset);
        ContextSwitchRecord record{
            tsc_conversion.ToTSC(context_switch_record.time_in_nanos),
            context_switch_record.IsOut(),
            static_cast<lldb::pid_t>(context_switch_record.pid),
            static_cast<lldb::tid_t>(context_switch_record.tid)};

        if (Error err = RecoverExecutionsFromConsecutiveRecords(
                cpu_id, tsc_conversion, record, prev_record,
                [&](const ThreadContinuousExecution &execution) {
                  executions.push_back(execution);
                }))
          return err;

        prev_record = record;
      }
      offset += perf_record.size;
    }

    // We might have an incomplete last record.
    if (prev_record && prev_record->IsIn())
      executions.push_back(ThreadContinuousExecution::CreateOnlyStartExecution(
          cpu_id, prev_record->tid, prev_record->pid, prev_record->tsc));
    return Error::success();
  };

  if (Error err = do_decode())
    return createStringError(inconvertibleErrorCode(),
                             formatv("Malformed perf context switch trace for "
                                     "cpu {0} at offset {1}. {2}",
                                     cpu_id, offset, toString(std::move(err))));

  return executions;
}

Expected<std::vector<uint8_t>>
lldb_private::trace_intel_pt::FilterProcessesFromContextSwitchTrace(
    llvm::ArrayRef<uint8_t> data, const std::set<lldb::pid_t> &pids) {
  size_t offset = 0;
  std::vector<uint8_t> out_data;
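  // Keep only the context switch records that belong to one of the requested
  // processes, plus any error records (lost/throttle events), copying them
  // byte-for-byte so the output stays a valid perf_event record stream.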

  while (offset < data.size()) {
    const perf_event_header &perf_record =
        *reinterpret_cast<const perf_event_header *>(data.data() + offset);
    if (Error err = perf_record.SanityCheck())
      return std::move(err);
    bool should_copy = false;
    if (perf_record.IsContextSwitchRecord()) {
      const PerfContextSwitchRecord &context_switch_record =
          *reinterpret_cast<const PerfContextSwitchRecord *>(data.data() +
                                                             offset);
      if (pids.count(context_switch_record.pid))
        should_copy = true;
    } else if (perf_record.IsErrorRecord()) {
      should_copy = true;
    }

    if (should_copy) {
      for (size_t i = 0; i < perf_record.size; i++) {
        out_data.push_back(data[offset + i]);
      }
    }

    offset += perf_record.size;
  }
  return out_data;
}