1 /*
2  * Copyright © 2020-2021 Collabora, Ltd.
3  * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
4  * Author: Corentin Noël <corentin.noel@collabora.com>
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "intel_pps_driver.h"
10 
11 #include <dirent.h>
12 #include <fcntl.h>
13 #include <math.h>
14 #include <poll.h>
15 #include <strings.h>
16 #include <sys/ioctl.h>
17 #include <unistd.h>
18 
19 #include <i915_drm.h>
20 #include <intel/perf/intel_perf_query.h>
21 
22 #include <pps/pps.h>
23 #include <pps/pps_algorithm.h>
24 
25 #include "intel_pps_perf.h"
26 
27 namespace pps
28 {
get_min_sampling_period_ns()29 uint64_t IntelDriver::get_min_sampling_period_ns()
30 {
31    return 500000;
32 }
33 
enable_counter(uint32_t counter_id)34 void IntelDriver::enable_counter(uint32_t counter_id)
35 {
36    auto &counter = counters[counter_id];
37    auto &group = groups[counter.group];
38    if (perf->query) {
39       if (perf->query->symbol_name != group.name) {
40          PPS_LOG_ERROR(
41             "Unable to enable metrics from different sets: %u "
42             "belongs to %s but %s is currently in use.",
43             counter_id,
44             perf->query->symbol_name,
45             group.name.c_str());
46          return;
47       }
48    }
49 
50    enabled_counters.emplace_back(counter);
51    if (!perf->query) {
52       perf->query = perf->find_query_by_name(group.name);
53    }
54 }
55 
enable_all_counters()56 void IntelDriver::enable_all_counters()
57 {
58    // We can only enable one metric set at a time so at least enable one.
59    for (auto &group : groups) {
60       if (group.name == "RenderBasic") {
61          for (uint32_t counter_id : group.counters) {
62             auto &counter = counters[counter_id];
63             enabled_counters.emplace_back(counter);
64          }
65 
66          perf->query = perf->find_query_by_name(group.name);
67          break;
68       }
69    }
70 }
71 
/// @brief Elapsed time between two timespecs
/// @param begin Start of the interval
/// @param end End of the interval
/// @return `end - begin` expressed in nanoseconds
static uint64_t timespec_diff(timespec *begin, timespec *end)
{
   const uint64_t whole_seconds_ns = 1000000000ull * (end->tv_sec - begin->tv_sec);
   return whole_seconds_ns + end->tv_nsec - begin->tv_nsec;
}
76 
77 /// @brief This function tries to correlate CPU time with GPU time
query_correlation_timestamps() const78 std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
79 {
80    TimestampCorrelation corr = {};
81 
82    clock_t correlation_clock_id = CLOCK_BOOTTIME;
83 
84    drm_i915_reg_read reg_read = {};
85    const uint64_t render_ring_timestamp = 0x2358;
86    reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
87 
88    constexpr size_t attempt_count = 3;
89    struct {
90       timespec cpu_ts_begin;
91       timespec cpu_ts_end;
92       uint64_t gpu_ts;
93    } attempts[attempt_count] = {};
94 
95    uint32_t best = 0;
96 
97    // Gather 3 correlations
98    for (uint32_t i = 0; i < attempt_count; i++) {
99       clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
100       if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
101          return std::nullopt;
102       }
103       clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
104 
105       attempts[i].gpu_ts = reg_read.val;
106    }
107 
108    // Now select the best
109    for (uint32_t i = 1; i < attempt_count; i++) {
110       if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
111          timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
112          best = i;
113       }
114    }
115 
116    corr.cpu_timestamp =
117       (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
118       timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
119    corr.gpu_timestamp = attempts[best].gpu_ts;
120 
121    return corr;
122 }
123 
get_new_correlation()124 void IntelDriver::get_new_correlation()
125 {
126    // Rotate left correlations by one position so to make space at the end
127    std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
128 
129    // Then we overwrite the last correlation with a new one
130    if (auto corr = query_correlation_timestamps()) {
131       correlations.back() = *corr;
132    } else {
133       PPS_LOG_FATAL("Failed to get correlation timestamps");
134    }
135 }
136 
init_perfcnt()137 bool IntelDriver::init_perfcnt()
138 {
139    assert(!perf && "Intel perf should not be initialized at this point");
140 
141    perf = std::make_unique<IntelPerf>(drm_device.fd);
142 
143    for (auto &query : perf->get_queries()) {
144       // Create group
145       CounterGroup group = {};
146       group.id = groups.size();
147       group.name = query->symbol_name;
148 
149       for (int i = 0; i < query->n_counters; ++i) {
150          intel_perf_query_counter &counter = query->counters[i];
151 
152          // Create counter
153          Counter counter_desc = {};
154          counter_desc.id = counters.size();
155          counter_desc.name = counter.symbol_name;
156          counter_desc.group = group.id;
157          counter_desc.getter = [counter, query, this](
158                                   const Counter &c, const Driver &dri) -> Counter::Value {
159             switch (counter.data_type) {
160             case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
161             case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
162             case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
163                return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result);
164                break;
165             case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
166             case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
167                return counter.oa_counter_read_float(perf->cfg, query, &result);
168                break;
169             }
170 
171             return {};
172          };
173 
174          // Add counter id to the group
175          group.counters.emplace_back(counter_desc.id);
176 
177          // Store counter
178          counters.emplace_back(std::move(counter_desc));
179       }
180 
181       // Store group
182       groups.emplace_back(std::move(group));
183    }
184 
185    assert(groups.size() && "Failed to query groups");
186    assert(counters.size() && "Failed to query counters");
187 
188    // Clear accumulations
189    intel_perf_query_result_clear(&result);
190 
191    return true;
192 }
193 
enable_perfcnt(uint64_t sampling_period_ns)194 void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
195 {
196    this->sampling_period_ns = sampling_period_ns;
197 
198    // Fill correlations with an initial one
199    if (auto corr = query_correlation_timestamps()) {
200       correlations.fill(*corr);
201    } else {
202       PPS_LOG_FATAL("Failed to get correlation timestamps");
203    }
204 
205    if (!perf->open(sampling_period_ns)) {
206       PPS_LOG_FATAL("Failed to open intel perf");
207    }
208 }
209 
210 /// @brief Transforms the GPU timestop into a CPU timestamp equivalent
correlate_gpu_timestamp(const uint32_t gpu_ts)211 uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
212 {
213    auto &corr_a = correlations[0];
214    auto &corr_b = correlations[correlations.size() - 1];
215 
216    // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
217    uint64_t mask = 0xffffffff;
218    uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
219    uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
220 
221    // Make sure it is within the interval [a,b)
222    assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
223    assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
224 
225    uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
226    // Factor to convert gpu time to cpu time
227    double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
228       double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
229    uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
230    return corr_a.cpu_timestamp + cpu_delta;
231 }
232 
disable_perfcnt()233 void IntelDriver::disable_perfcnt()
234 {
235    perf = nullptr;
236    groups.clear();
237    counters.clear();
238    enabled_counters.clear();
239 }
240 
/// @brief Leading fields of the report stored right after a sample record header
///
/// This struct is overlaid on raw record memory with reinterpret_cast, so the
/// order and size of its fields must not change.
struct Report {
   uint32_t version;
   // GPU timestamp of the report, fed to correlate_gpu_timestamp()
   uint32_t timestamp;
   uint32_t id;
};
246 
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previously accepted record
/// @param sampling_period Sampling period, in the same unit as duration
/// @return True if the duration is at least close to the sampling period
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   // How much shorter than the sampling period a duration is allowed to be
   constexpr uint64_t tolerance = 100000;

   // When the period is below the tolerance, the unsigned subtraction would
   // wrap around to a huge threshold and reject everything. Mathematically the
   // threshold is negative, so any duration qualifies.
   if (sampling_period < tolerance) {
      return true;
   }

   return duration > sampling_period - tolerance;
}
253 
254 /// @brief Transforms the raw data received in from the driver into records
parse_perf_records(const std::vector<uint8_t> & data,const size_t byte_count)255 std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
256    const size_t byte_count)
257 {
258    std::vector<PerfRecord> records;
259    records.reserve(128);
260 
261    PerfRecord record;
262    record.reserve(512);
263 
264    const uint8_t *iter = data.data();
265    const uint8_t *end = iter + byte_count;
266 
267    uint64_t prev_cpu_timestamp = last_cpu_timestamp;
268 
269    while (iter < end) {
270       // Iterate a record at a time
271       auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
272 
273       if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
274          // Report is next to the header
275          auto report = reinterpret_cast<const Report *>(header + 1);
276          auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
277          auto duration = cpu_timestamp - prev_cpu_timestamp;
278 
279          // Skip perf-records that are too short by checking
280          // the distance between last report and this one
281          if (close_enough(duration, sampling_period_ns)) {
282             prev_cpu_timestamp = cpu_timestamp;
283 
284             // Add the new record to the list
285             record.resize(header->size); // Possibly 264?
286             memcpy(record.data(), iter, header->size);
287             records.emplace_back(record);
288          }
289       }
290 
291       // Go to the next record
292       iter += header->size;
293    }
294 
295    return records;
296 }
297 
298 /// @brief Read all the available data from the metric set currently in use
read_data_from_metric_set()299 void IntelDriver::read_data_from_metric_set()
300 {
301    assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
302 
303    ssize_t bytes_read = 0;
304    while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
305               metric_buffer.size() - total_bytes_read)) > 0 ||
306       errno == EINTR) {
307       total_bytes_read += std::max(ssize_t(0), bytes_read);
308 
309       // Increase size of the buffer for the next read
310       if (metric_buffer.size() / 2 < total_bytes_read) {
311          metric_buffer.resize(metric_buffer.size() * 2);
312       }
313    }
314 
315    assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
316 }
317 
dump_perfcnt()318 bool IntelDriver::dump_perfcnt()
319 {
320    if (!perf->oa_stream_ready()) {
321       return false;
322    }
323 
324    read_data_from_metric_set();
325 
326    get_new_correlation();
327 
328    auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
329    if (new_records.empty()) {
330       PPS_LOG("No new records");
331       // No new records from the GPU yet
332       return false;
333    } else {
334       PPS_LOG("Records parsed bytes: %lu", total_bytes_read);
335       // Records are parsed correctly, so we can reset the
336       // number of bytes read so far from the metric set
337       total_bytes_read = 0;
338    }
339 
340    APPEND(records, new_records);
341 
342    if (records.size() < 2) {
343       // Not enough records to accumulate
344       return false;
345    }
346 
347    return true;
348 }
349 
gpu_next()350 uint32_t IntelDriver::gpu_next()
351 {
352    if (records.size() < 2) {
353       // Not enough records to accumulate
354       return 0;
355    }
356 
357    // Get first and second
358    auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
359    auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
360 
361    intel_perf_query_result_accumulate_fields(&result,
362       &perf->query.value(),
363       &perf->devinfo,
364       record_a + 1,
365       record_b + 1,
366       false /* no_oa_accumulate */);
367 
368    // Get last timestamp
369    auto report_b = reinterpret_cast<const Report *>(record_b + 1);
370    auto gpu_timestamp = report_b->timestamp;
371 
372    // Consume first record
373    records.erase(std::begin(records), std::begin(records) + 1);
374 
375    return gpu_timestamp;
376 }
377 
cpu_next()378 uint64_t IntelDriver::cpu_next()
379 {
380    if (auto gpu_timestamp = gpu_next()) {
381       auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
382 
383       last_cpu_timestamp = cpu_timestamp;
384       return cpu_timestamp;
385    }
386 
387    return 0;
388 }
389 
next()390 uint64_t IntelDriver::next()
391 {
392    // Reset accumulation
393    intel_perf_query_result_clear(&result);
394    return cpu_next();
395 }
396 
397 } // namespace pps
398