//===-- PerfContextSwitchDecoder.cpp --------------------------------------===//
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 
8 #include "PerfContextSwitchDecoder.h"
9 
10 using namespace lldb;
11 using namespace lldb_private;
12 using namespace lldb_private::trace_intel_pt;
13 using namespace llvm;
14 
15 /// Copied from <linux/perf_event.h> to avoid depending on perf_event.h on
16 /// non-linux platforms.
17 /// \{
18 #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13)
19 
20 #define PERF_RECORD_LOST 2
21 #define PERF_RECORD_THROTTLE 5
22 #define PERF_RECORD_UNTHROTTLE 6
23 #define PERF_RECORD_LOST_SAMPLES 13
24 #define PERF_RECORD_SWITCH_CPU_WIDE 15
25 #define PERF_RECORD_MAX 19
26 
27 struct perf_event_header {
28   uint32_t type;
29   uint16_t misc;
30   uint16_t size;
31 
32   /// \return
33   ///   An \a llvm::Error if the record looks obviously wrong, or \a
34   ///   llvm::Error::success() otherwise.
35   Error SanityCheck() const {
36     // The following checks are based on visual inspection of the records and
37     // enums in
38     // https://elixir.bootlin.com/linux/v4.8/source/include/uapi/linux/perf_event.h
39     // See PERF_RECORD_MAX, PERF_RECORD_SWITCH and the data similar records
40     // hold.
41 
42     // A record of too many uint64_t's or more should mean that the data is
43     // wrong
44     const uint64_t max_valid_size_bytes = 8000;
45     if (size == 0 || size > max_valid_size_bytes)
46       return createStringError(
47           inconvertibleErrorCode(),
48           formatv("A record of {0} bytes was found.", size));
49 
50     // We add some numbers to PERF_RECORD_MAX because some systems might have
51     // custom records. In any case, we are looking only for abnormal data.
52     if (type >= PERF_RECORD_MAX + 100)
53       return createStringError(
54           inconvertibleErrorCode(),
55           formatv("Invalid record type {0} was found.", type));
56     return Error::success();
57   }
58 
59   bool IsContextSwitchRecord() const {
60     return type == PERF_RECORD_SWITCH_CPU_WIDE;
61   }
62 
63   bool IsErrorRecord() const {
64     return type == PERF_RECORD_LOST || type == PERF_RECORD_THROTTLE ||
65            type == PERF_RECORD_UNTHROTTLE || type == PERF_RECORD_LOST_SAMPLES;
66   }
67 };
68 /// \}
69 
/// Record found in the perf_event context switch traces. It might contain
/// additional fields in memory, but header.size should have the actual size
/// of the record.
///
/// NOTE: this struct is overlaid directly on the raw trace buffer via
/// reinterpret_cast (see DecodePerfContextSwitchTrace), so the field order
/// and types must not change.
struct PerfContextSwitchRecord {
  struct perf_event_header header;
  // pid/tid on the other side of the switch — presumably following the
  // PERF_RECORD_SWITCH_CPU_WIDE layout from <linux/perf_event.h>; confirm
  // against the kernel header.
  uint32_t next_prev_pid;
  uint32_t next_prev_tid;
  // pid/tid of the thread this record refers to.
  uint32_t pid, tid;
  // Timestamp in nanoseconds; converted to TSC by the decoder.
  uint64_t time_in_nanos;

  /// \return \b true if this record is a switch *out* of a thread, per the
  /// PERF_RECORD_MISC_SWITCH_OUT bit in the header's misc field.
  bool IsOut() const { return header.misc & PERF_RECORD_MISC_SWITCH_OUT; }
};
82 
/// Record produced after parsing the raw context switch trace produced by
/// perf_event. A major difference between this struct and
/// PerfContextSwitchRecord is that this one uses tsc instead of nanos.
struct ContextSwitchRecord {
  /// Timestamp in TSC units, converted from nanoseconds by the decoder.
  uint64_t tsc;
  /// Whether the switch is in or out
  bool is_out;
  /// pid = 0 and tid = 0 indicate the swapper or idle process, which normally
  /// runs after a context switch out of a normal user thread.
  lldb::pid_t pid;
  lldb::tid_t tid;

  /// \return \b true if this record is a switch out of a thread.
  bool IsOut() const { return is_out; }

  /// \return \b true if this record is a switch into a thread.
  bool IsIn() const { return !is_out; }
};
99 
100 uint64_t ThreadContinuousExecution::GetLowestKnownTSC() const {
101   switch (variant) {
102   case Variant::Complete:
103     return tscs.complete.start;
104   case Variant::OnlyStart:
105     return tscs.only_start.start;
106   case Variant::OnlyEnd:
107     return tscs.only_end.end;
108   case Variant::HintedEnd:
109     return tscs.hinted_end.start;
110   case Variant::HintedStart:
111     return tscs.hinted_start.end;
112   }
113 }
114 
115 uint64_t ThreadContinuousExecution::GetStartTSC() const {
116   switch (variant) {
117   case Variant::Complete:
118     return tscs.complete.start;
119   case Variant::OnlyStart:
120     return tscs.only_start.start;
121   case Variant::OnlyEnd:
122     return 0;
123   case Variant::HintedEnd:
124     return tscs.hinted_end.start;
125   case Variant::HintedStart:
126     return tscs.hinted_start.hinted_start;
127   }
128 }
129 
130 uint64_t ThreadContinuousExecution::GetEndTSC() const {
131   switch (variant) {
132   case Variant::Complete:
133     return tscs.complete.end;
134   case Variant::OnlyStart:
135     return std::numeric_limits<uint64_t>::max();
136   case Variant::OnlyEnd:
137     return tscs.only_end.end;
138   case Variant::HintedEnd:
139     return tscs.hinted_end.hinted_end;
140   case Variant::HintedStart:
141     return tscs.hinted_start.end;
142   }
143 }
144 
145 ThreadContinuousExecution ThreadContinuousExecution::CreateCompleteExecution(
146     lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start,
147     uint64_t end) {
148   ThreadContinuousExecution o(cpu_id, tid, pid);
149   o.variant = Variant::Complete;
150   o.tscs.complete.start = start;
151   o.tscs.complete.end = end;
152   return o;
153 }
154 
155 ThreadContinuousExecution ThreadContinuousExecution::CreateHintedStartExecution(
156     lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid,
157     uint64_t hinted_start, uint64_t end) {
158   ThreadContinuousExecution o(cpu_id, tid, pid);
159   o.variant = Variant::HintedStart;
160   o.tscs.hinted_start.hinted_start = hinted_start;
161   o.tscs.hinted_start.end = end;
162   return o;
163 }
164 
165 ThreadContinuousExecution ThreadContinuousExecution::CreateHintedEndExecution(
166     lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start,
167     uint64_t hinted_end) {
168   ThreadContinuousExecution o(cpu_id, tid, pid);
169   o.variant = Variant::HintedEnd;
170   o.tscs.hinted_end.start = start;
171   o.tscs.hinted_end.hinted_end = hinted_end;
172   return o;
173 }
174 
175 ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyEndExecution(
176     lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t end) {
177   ThreadContinuousExecution o(cpu_id, tid, pid);
178   o.variant = Variant::OnlyEnd;
179   o.tscs.only_end.end = end;
180   return o;
181 }
182 
183 ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyStartExecution(
184     lldb::cpu_id_t cpu_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start) {
185   ThreadContinuousExecution o(cpu_id, tid, pid);
186   o.variant = Variant::OnlyStart;
187   o.tscs.only_start.start = start;
188   return o;
189 }
190 
/// Pair the current context switch record with the previous one (if any) and
/// report the thread executions that can be recovered from that pair through
/// \p on_new_execution. Depending on the in/out pattern, zero, one, or two
/// executions are produced per call.
///
/// NOTE(review): \p tsc_conversion is not used in this body — the
/// nanos-to-TSC conversion already happened in the caller. Presumably kept
/// for interface symmetry; confirm before removing.
///
/// \return
///   An \a llvm::Error if the records are out of order, or
///   \a llvm::Error::success() otherwise.
static Error RecoverExecutionsFromConsecutiveRecords(
    cpu_id_t cpu_id, const LinuxPerfZeroTscConversion &tsc_conversion,
    const ContextSwitchRecord &current_record,
    const Optional<ContextSwitchRecord> &prev_record,
    std::function<void(const ThreadContinuousExecution &execution)>
        on_new_execution) {
  if (!prev_record) {
    if (current_record.IsOut()) {
      // An 'out' with no preceding record: we only know when this execution
      // ended, not when it started.
      on_new_execution(ThreadContinuousExecution::CreateOnlyEndExecution(
          cpu_id, current_record.tid, current_record.pid, current_record.tsc));
    }
    // The 'in' case will be handled later when we try to look for its end
    return Error::success();
  }

  const ContextSwitchRecord &prev = *prev_record;
  // Records on a single core must be strictly increasing in time.
  if (prev.tsc >= current_record.tsc)
    return createStringError(
        inconvertibleErrorCode(),
        formatv("A context switch record doesn't happen after the previous "
                "record. Previous TSC= {0}, current TSC = {1}.",
                prev.tsc, current_record.tsc));

  if (current_record.IsIn() && prev.IsIn()) {
    // We found two consecutive ins, which means that we didn't capture
    // the end of the previous execution.
    on_new_execution(ThreadContinuousExecution::CreateHintedEndExecution(
        cpu_id, prev.tid, prev.pid, prev.tsc, current_record.tsc - 1));
  } else if (current_record.IsOut() && prev.IsOut()) {
    // We found two consecutive outs, that means that we didn't capture
    // the beginning of the current execution.
    on_new_execution(ThreadContinuousExecution::CreateHintedStartExecution(
        cpu_id, current_record.tid, current_record.pid, prev.tsc + 1,
        current_record.tsc));
  } else if (current_record.IsOut() && prev.IsIn()) {
    if (current_record.pid == prev.pid && current_record.tid == prev.tid) {
      /// A complete execution
      on_new_execution(ThreadContinuousExecution::CreateCompleteExecution(
          cpu_id, current_record.tid, current_record.pid, prev.tsc,
          current_record.tsc));
    } else {
      // An out after the in of a different thread. The first one doesn't
      // have an end, and the second one doesn't have a start.
      on_new_execution(ThreadContinuousExecution::CreateHintedEndExecution(
          cpu_id, prev.tid, prev.pid, prev.tsc, current_record.tsc - 1));
      on_new_execution(ThreadContinuousExecution::CreateHintedStartExecution(
          cpu_id, current_record.tid, current_record.pid, prev.tsc + 1,
          current_record.tsc));
    }
  }
  // Remaining case: an 'in' after an 'out'. Nothing is emitted here — the
  // new execution's end will be recovered when a later record arrives.
  return Error::success();
}
243 
244 Expected<std::vector<ThreadContinuousExecution>>
245 lldb_private::trace_intel_pt::DecodePerfContextSwitchTrace(
246     ArrayRef<uint8_t> data, cpu_id_t cpu_id,
247     const LinuxPerfZeroTscConversion &tsc_conversion) {
248 
249   std::vector<ThreadContinuousExecution> executions;
250 
251   // This offset is used to create the error message in case of failures.
252   size_t offset = 0;
253 
254   auto do_decode = [&]() -> Error {
255     Optional<ContextSwitchRecord> prev_record;
256     while (offset < data.size()) {
257       const perf_event_header &perf_record =
258           *reinterpret_cast<const perf_event_header *>(data.data() + offset);
259       if (Error err = perf_record.SanityCheck())
260         return err;
261 
262       if (perf_record.IsContextSwitchRecord()) {
263         const PerfContextSwitchRecord &context_switch_record =
264             *reinterpret_cast<const PerfContextSwitchRecord *>(data.data() +
265                                                                offset);
266         ContextSwitchRecord record{
267             tsc_conversion.ToTSC(context_switch_record.time_in_nanos),
268             context_switch_record.IsOut(),
269             static_cast<lldb::pid_t>(context_switch_record.pid),
270             static_cast<lldb::tid_t>(context_switch_record.tid)};
271 
272         if (Error err = RecoverExecutionsFromConsecutiveRecords(
273                 cpu_id, tsc_conversion, record, prev_record,
274                 [&](const ThreadContinuousExecution &execution) {
275                   executions.push_back(execution);
276                 }))
277           return err;
278 
279         prev_record = record;
280       }
281       offset += perf_record.size;
282     }
283 
284     // We might have an incomplete last record
285     if (prev_record && prev_record->IsIn())
286       executions.push_back(ThreadContinuousExecution::CreateOnlyStartExecution(
287           cpu_id, prev_record->tid, prev_record->pid, prev_record->tsc));
288     return Error::success();
289   };
290 
291   if (Error err = do_decode())
292     return createStringError(inconvertibleErrorCode(),
293                              formatv("Malformed perf context switch trace for "
294                                      "cpu {0} at offset {1}. {2}",
295                                      cpu_id, offset, toString(std::move(err))));
296 
297   return executions;
298 }
299 
300 Expected<std::vector<uint8_t>>
301 lldb_private::trace_intel_pt::FilterProcessesFromContextSwitchTrace(
302     llvm::ArrayRef<uint8_t> data, const std::set<lldb::pid_t> &pids) {
303   size_t offset = 0;
304   std::vector<uint8_t> out_data;
305 
306   while (offset < data.size()) {
307     const perf_event_header &perf_record =
308         *reinterpret_cast<const perf_event_header *>(data.data() + offset);
309     if (Error err = perf_record.SanityCheck())
310       return std::move(err);
311     bool should_copy = false;
312     if (perf_record.IsContextSwitchRecord()) {
313       const PerfContextSwitchRecord &context_switch_record =
314           *reinterpret_cast<const PerfContextSwitchRecord *>(data.data() +
315                                                              offset);
316       if (pids.count(context_switch_record.pid))
317         should_copy = true;
318     } else if (perf_record.IsErrorRecord()) {
319       should_copy = true;
320     }
321 
322     if (should_copy) {
323       for (size_t i = 0; i < perf_record.size; i++) {
324         out_data.push_back(data[offset + i]);
325       }
326     }
327 
328     offset += perf_record.size;
329   }
330   return out_data;
331 }
332