1 /*
2 * Copyright © 2020-2021 Collabora, Ltd.
3 * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
4 * Author: Corentin Noël <corentin.noel@collabora.com>
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9 #include "intel_pps_driver.h"
10
11 #include <dirent.h>
12 #include <fcntl.h>
13 #include <math.h>
14 #include <poll.h>
15 #include <strings.h>
16 #include <sys/ioctl.h>
17 #include <unistd.h>
18
19 #include <i915_drm.h>
20 #include <intel/perf/intel_perf_query.h>
21
22 #include <pps/pps.h>
23 #include <pps/pps_algorithm.h>
24
25 #include "intel_pps_perf.h"
26
27 namespace pps
28 {
get_min_sampling_period_ns()29 uint64_t IntelDriver::get_min_sampling_period_ns()
30 {
31 return 500000;
32 }
33
enable_counter(uint32_t counter_id)34 void IntelDriver::enable_counter(uint32_t counter_id)
35 {
36 auto &counter = counters[counter_id];
37 auto &group = groups[counter.group];
38 if (perf->query) {
39 if (perf->query->symbol_name != group.name) {
40 PPS_LOG_ERROR(
41 "Unable to enable metrics from different sets: %u "
42 "belongs to %s but %s is currently in use.",
43 counter_id,
44 perf->query->symbol_name,
45 group.name.c_str());
46 return;
47 }
48 }
49
50 enabled_counters.emplace_back(counter);
51 if (!perf->query) {
52 perf->query = perf->find_query_by_name(group.name);
53 }
54 }
55
enable_all_counters()56 void IntelDriver::enable_all_counters()
57 {
58 // We can only enable one metric set at a time so at least enable one.
59 for (auto &group : groups) {
60 if (group.name == "RenderBasic") {
61 for (uint32_t counter_id : group.counters) {
62 auto &counter = counters[counter_id];
63 enabled_counters.emplace_back(counter);
64 }
65
66 perf->query = perf->find_query_by_name(group.name);
67 break;
68 }
69 }
70 }
71
/// @brief Elapsed time between two timespecs
/// @param begin Start of the interval
/// @param end End of the interval
/// @return `end - begin` expressed in nanoseconds
static uint64_t timespec_diff(timespec *begin, timespec *end)
{
   const uint64_t whole_seconds_ns = 1000000000ull * (end->tv_sec - begin->tv_sec);
   return whole_seconds_ns + end->tv_nsec - begin->tv_nsec;
}
76
77 /// @brief This function tries to correlate CPU time with GPU time
query_correlation_timestamps() const78 std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
79 {
80 TimestampCorrelation corr = {};
81
82 clock_t correlation_clock_id = CLOCK_BOOTTIME;
83
84 drm_i915_reg_read reg_read = {};
85 const uint64_t render_ring_timestamp = 0x2358;
86 reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
87
88 constexpr size_t attempt_count = 3;
89 struct {
90 timespec cpu_ts_begin;
91 timespec cpu_ts_end;
92 uint64_t gpu_ts;
93 } attempts[attempt_count] = {};
94
95 uint32_t best = 0;
96
97 // Gather 3 correlations
98 for (uint32_t i = 0; i < attempt_count; i++) {
99 clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
100 if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) {
101 return std::nullopt;
102 }
103 clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
104
105 attempts[i].gpu_ts = reg_read.val;
106 }
107
108 // Now select the best
109 for (uint32_t i = 1; i < attempt_count; i++) {
110 if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
111 timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
112 best = i;
113 }
114 }
115
116 corr.cpu_timestamp =
117 (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
118 timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
119 corr.gpu_timestamp = attempts[best].gpu_ts;
120
121 return corr;
122 }
123
get_new_correlation()124 void IntelDriver::get_new_correlation()
125 {
126 // Rotate left correlations by one position so to make space at the end
127 std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
128
129 // Then we overwrite the last correlation with a new one
130 if (auto corr = query_correlation_timestamps()) {
131 correlations.back() = *corr;
132 } else {
133 PPS_LOG_FATAL("Failed to get correlation timestamps");
134 }
135 }
136
init_perfcnt()137 bool IntelDriver::init_perfcnt()
138 {
139 assert(!perf && "Intel perf should not be initialized at this point");
140
141 perf = std::make_unique<IntelPerf>(drm_device.fd);
142
143 for (auto &query : perf->get_queries()) {
144 // Create group
145 CounterGroup group = {};
146 group.id = groups.size();
147 group.name = query->symbol_name;
148
149 for (int i = 0; i < query->n_counters; ++i) {
150 intel_perf_query_counter &counter = query->counters[i];
151
152 // Create counter
153 Counter counter_desc = {};
154 counter_desc.id = counters.size();
155 counter_desc.name = counter.symbol_name;
156 counter_desc.group = group.id;
157 counter_desc.getter = [counter, query, this](
158 const Counter &c, const Driver &dri) -> Counter::Value {
159 switch (counter.data_type) {
160 case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
161 case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
162 case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
163 return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result);
164 break;
165 case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
166 case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
167 return counter.oa_counter_read_float(perf->cfg, query, &result);
168 break;
169 }
170
171 return {};
172 };
173
174 // Add counter id to the group
175 group.counters.emplace_back(counter_desc.id);
176
177 // Store counter
178 counters.emplace_back(std::move(counter_desc));
179 }
180
181 // Store group
182 groups.emplace_back(std::move(group));
183 }
184
185 assert(groups.size() && "Failed to query groups");
186 assert(counters.size() && "Failed to query counters");
187
188 // Clear accumulations
189 intel_perf_query_result_clear(&result);
190
191 return true;
192 }
193
enable_perfcnt(uint64_t sampling_period_ns)194 void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
195 {
196 this->sampling_period_ns = sampling_period_ns;
197
198 // Fill correlations with an initial one
199 if (auto corr = query_correlation_timestamps()) {
200 correlations.fill(*corr);
201 } else {
202 PPS_LOG_FATAL("Failed to get correlation timestamps");
203 }
204
205 if (!perf->open(sampling_period_ns)) {
206 PPS_LOG_FATAL("Failed to open intel perf");
207 }
208 }
209
210 /// @brief Transforms the GPU timestop into a CPU timestamp equivalent
correlate_gpu_timestamp(const uint32_t gpu_ts)211 uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
212 {
213 auto &corr_a = correlations[0];
214 auto &corr_b = correlations[correlations.size() - 1];
215
216 // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
217 uint64_t mask = 0xffffffff;
218 uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
219 uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
220
221 // Make sure it is within the interval [a,b)
222 assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
223 assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
224
225 uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
226 // Factor to convert gpu time to cpu time
227 double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
228 double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
229 uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
230 return corr_a.cpu_timestamp + cpu_delta;
231 }
232
disable_perfcnt()233 void IntelDriver::disable_perfcnt()
234 {
235 perf = nullptr;
236 groups.clear();
237 counters.clear();
238 enabled_counters.clear();
239 }
240
/// @brief Leading fields of the data which follows a perf record header
///
/// Used as an overlay (through a pointer cast) on the bytes located right
/// after a `drm_i915_perf_record_header`, so the layout must not change.
struct Report {
   uint32_t version;
   /// GPU timestamp of the report, fed to correlate_gpu_timestamp()
   uint32_t timestamp;
   uint32_t id;
};
246
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previous accepted record, in ns
/// @param sampling_period Requested sampling period, in ns
/// @return True if the duration is at least close to the sampling period,
/// i.e. longer than the period minus a 100000 ns tolerance
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   constexpr uint64_t tolerance_ns = 100000;

   // The subtraction below is unsigned: with a period shorter than the
   // tolerance it would wrap around and reject every duration, whereas
   // the threshold is really negative and every duration passes.
   if (sampling_period < tolerance_ns) {
      return true;
   }

   return duration > sampling_period - tolerance_ns;
}
253
254 /// @brief Transforms the raw data received in from the driver into records
parse_perf_records(const std::vector<uint8_t> & data,const size_t byte_count)255 std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
256 const size_t byte_count)
257 {
258 std::vector<PerfRecord> records;
259 records.reserve(128);
260
261 PerfRecord record;
262 record.reserve(512);
263
264 const uint8_t *iter = data.data();
265 const uint8_t *end = iter + byte_count;
266
267 uint64_t prev_cpu_timestamp = last_cpu_timestamp;
268
269 while (iter < end) {
270 // Iterate a record at a time
271 auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
272
273 if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
274 // Report is next to the header
275 auto report = reinterpret_cast<const Report *>(header + 1);
276 auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
277 auto duration = cpu_timestamp - prev_cpu_timestamp;
278
279 // Skip perf-records that are too short by checking
280 // the distance between last report and this one
281 if (close_enough(duration, sampling_period_ns)) {
282 prev_cpu_timestamp = cpu_timestamp;
283
284 // Add the new record to the list
285 record.resize(header->size); // Possibly 264?
286 memcpy(record.data(), iter, header->size);
287 records.emplace_back(record);
288 }
289 }
290
291 // Go to the next record
292 iter += header->size;
293 }
294
295 return records;
296 }
297
298 /// @brief Read all the available data from the metric set currently in use
read_data_from_metric_set()299 void IntelDriver::read_data_from_metric_set()
300 {
301 assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
302
303 ssize_t bytes_read = 0;
304 while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
305 metric_buffer.size() - total_bytes_read)) > 0 ||
306 errno == EINTR) {
307 total_bytes_read += std::max(ssize_t(0), bytes_read);
308
309 // Increase size of the buffer for the next read
310 if (metric_buffer.size() / 2 < total_bytes_read) {
311 metric_buffer.resize(metric_buffer.size() * 2);
312 }
313 }
314
315 assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
316 }
317
dump_perfcnt()318 bool IntelDriver::dump_perfcnt()
319 {
320 if (!perf->oa_stream_ready()) {
321 return false;
322 }
323
324 read_data_from_metric_set();
325
326 get_new_correlation();
327
328 auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
329 if (new_records.empty()) {
330 PPS_LOG("No new records");
331 // No new records from the GPU yet
332 return false;
333 } else {
334 PPS_LOG("Records parsed bytes: %lu", total_bytes_read);
335 // Records are parsed correctly, so we can reset the
336 // number of bytes read so far from the metric set
337 total_bytes_read = 0;
338 }
339
340 APPEND(records, new_records);
341
342 if (records.size() < 2) {
343 // Not enough records to accumulate
344 return false;
345 }
346
347 return true;
348 }
349
gpu_next()350 uint32_t IntelDriver::gpu_next()
351 {
352 if (records.size() < 2) {
353 // Not enough records to accumulate
354 return 0;
355 }
356
357 // Get first and second
358 auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
359 auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
360
361 intel_perf_query_result_accumulate_fields(&result,
362 &perf->query.value(),
363 &perf->devinfo,
364 record_a + 1,
365 record_b + 1,
366 false /* no_oa_accumulate */);
367
368 // Get last timestamp
369 auto report_b = reinterpret_cast<const Report *>(record_b + 1);
370 auto gpu_timestamp = report_b->timestamp;
371
372 // Consume first record
373 records.erase(std::begin(records), std::begin(records) + 1);
374
375 return gpu_timestamp;
376 }
377
cpu_next()378 uint64_t IntelDriver::cpu_next()
379 {
380 if (auto gpu_timestamp = gpu_next()) {
381 auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
382
383 last_cpu_timestamp = cpu_timestamp;
384 return cpu_timestamp;
385 }
386
387 return 0;
388 }
389
next()390 uint64_t IntelDriver::next()
391 {
392 // Reset accumulation
393 intel_perf_query_result_clear(&result);
394 return cpu_next();
395 }
396
397 } // namespace pps
398