1 /*
2  * Copyright © 2020-2021 Collabora, Ltd.
3  * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
4  * Author: Corentin Noël <corentin.noel@collabora.com>
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "intel_pps_driver.h"
10 
11 #include <dirent.h>
12 #include <fcntl.h>
13 #include <math.h>
14 #include <poll.h>
15 #include <strings.h>
16 #include <sys/ioctl.h>
17 #include <unistd.h>
18 
19 #include "drm-uapi/i915_drm.h"
20 
21 #include "common/intel_gem.h"
22 #include "dev/intel_device_info.h"
23 #include "perf/intel_perf.h"
24 #include "perf/intel_perf_query.h"
25 
26 #include <pps/pps.h>
27 #include <pps/pps_algorithm.h>
28 
29 #include "intel_pps_perf.h"
30 #include "intel_pps_priv.h"
31 
32 namespace pps
33 {
34 
35 // The HW sampling period is programmed using period_exponent following this
36 // formula:
37 //    sample_period = timestamp_period * 2^(period_exponent + 1)
38 // So our minimum sampling period is twice the timestamp period
39 
get_min_sampling_period_ns()40 uint64_t IntelDriver::get_min_sampling_period_ns()
41 {
42    return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
43 }
44 
IntelDriver()45 IntelDriver::IntelDriver()
46 {
47 }
48 
~IntelDriver()49 IntelDriver::~IntelDriver()
50 {
51 }
52 
enable_counter(uint32_t counter_id)53 void IntelDriver::enable_counter(uint32_t counter_id)
54 {
55    auto &counter = counters[counter_id];
56 
57    enabled_counters.emplace_back(counter);
58 }
59 
enable_all_counters()60 void IntelDriver::enable_all_counters()
61 {
62    // We should only have one group
63    assert(groups.size() == 1);
64    for (uint32_t counter_id : groups[0].counters) {
65       auto &counter = counters[counter_id];
66       enabled_counters.emplace_back(counter);
67    }
68 }
69 
init_perfcnt()70 bool IntelDriver::init_perfcnt()
71 {
72    /* Note: clock_id's below 128 are reserved.. for custom clock sources,
73     * using the hash of a namespaced string is the recommended approach.
74     * See: https://perfetto.dev/docs/concepts/clock-sync
75     */
76    this->clock_id = intel_pps_clock_id(drm_device.gpu_num);
77 
78    assert(!perf && "Intel perf should not be initialized at this point");
79 
80    perf = std::make_unique<IntelPerf>(drm_device.fd);
81 
82    const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");
83 
84    struct intel_perf_query_info *default_query = nullptr;
85    selected_query = nullptr;
86    for (auto &query : perf->get_queries()) {
87       if (!strcmp(query->symbol_name, "RenderBasic"))
88          default_query = query;
89       if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
90          selected_query = query;
91    }
92 
93    assert(default_query);
94 
95    if (!selected_query) {
96       if (metric_set_name) {
97          PPS_LOG_ERROR("Available metric sets:");
98          for (auto &query : perf->get_queries())
99             PPS_LOG_ERROR("   %s", query->symbol_name);
100          PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
101       }
102       selected_query = default_query;
103    }
104 
105    PPS_LOG("Using metric set '%s': %s",
106            selected_query->symbol_name, selected_query->name);
107 
108    // Create group
109    CounterGroup group = {};
110    group.id = groups.size();
111    group.name = selected_query->symbol_name;
112 
113    for (int i = 0; i < selected_query->n_counters; ++i) {
114       intel_perf_query_counter &counter = selected_query->counters[i];
115 
116       // Create counter
117       Counter counter_desc = {};
118       counter_desc.id = counters.size();
119       counter_desc.name = counter.symbol_name;
120       counter_desc.group = group.id;
121       counter_desc.getter = [counter, this](
122          const Counter &c, const Driver &dri) -> Counter::Value {
123          switch (counter.data_type) {
124          case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
125          case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
126          case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
127             return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
128                                                            selected_query,
129                                                            &perf->result);
130             break;
131          case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
132          case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
133             return counter.oa_counter_read_float(perf->cfg,
134                                                  selected_query,
135                                                  &perf->result);
136             break;
137          }
138 
139          return {};
140       };
141 
142       // Add counter id to the group
143       group.counters.emplace_back(counter_desc.id);
144 
145       // Store counter
146       counters.emplace_back(std::move(counter_desc));
147    }
148 
149    // Store group
150    groups.emplace_back(std::move(group));
151 
152    assert(counters.size() && "Failed to query counters");
153 
154    // Clear accumulations
155    intel_perf_query_result_clear(&perf->result);
156 
157    return true;
158 }
159 
enable_perfcnt(uint64_t sampling_period_ns)160 void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
161 {
162    this->sampling_period_ns = sampling_period_ns;
163 
164    if (!perf->open(sampling_period_ns, selected_query)) {
165       PPS_LOG_FATAL("Failed to open intel perf");
166    }
167 }
168 
disable_perfcnt()169 void IntelDriver::disable_perfcnt()
170 {
171    gpu_timestamp_udw = 0;
172    perf = nullptr;
173    groups.clear();
174    counters.clear();
175    enabled_counters.clear();
176 }
177 
178 /// @brief Some perf record durations can be really short
179 /// @return True if the duration is at least close to the sampling period
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   // How much shorter than the sampling period a record may be, in the same
   // unit as the arguments
   constexpr uint64_t tolerance = 100000;

   // With a period below the tolerance, "period - tolerance" would be
   // negative: every duration qualifies. Handling it here avoids the
   // unsigned wrap-around that would otherwise reject every record.
   if (sampling_period < tolerance)
      return true;

   return duration > sampling_period - tolerance;
}
184 
185 /// @brief Transforms the raw data received in from the driver into records
parse_perf_records(const std::vector<uint8_t> & data,const size_t byte_count)186 std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
187    const size_t byte_count)
188 {
189    std::vector<PerfRecord> records;
190    records.reserve(128);
191 
192    PerfRecord record;
193    record.data.reserve(512);
194 
195    const uint8_t *iter = data.data();
196    const uint8_t *end = iter + byte_count;
197 
198    uint64_t prev_gpu_timestamp = last_gpu_timestamp;
199 
200    while (iter < end) {
201       // Iterate a record at a time
202       auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
203 
204       if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
205          // Report is next to the header
206          const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
207          uint64_t gpu_timestamp_ldw =
208             intel_perf_report_timestamp(selected_query, report);
209 
210          /* Our HW only provides us with the lower 32 bits of the 36bits
211           * timestamp counter value. If we haven't captured the top bits yet,
212           * do it now. If we see a roll over the lower 32bits capture it
213           * again.
214           */
215          if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw + gpu_timestamp_ldw) < last_gpu_timestamp)
216             gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & 0xffffffff00000000;
217 
218          uint64_t gpu_timestamp = gpu_timestamp_udw + gpu_timestamp_ldw;
219 
220          auto duration = intel_device_info_timebase_scale(&perf->devinfo,
221                                                           gpu_timestamp - prev_gpu_timestamp);
222 
223          // Skip perf-records that are too short by checking
224          // the distance between last report and this one
225          if (close_enough(duration, sampling_period_ns)) {
226             prev_gpu_timestamp = gpu_timestamp;
227 
228             // Add the new record to the list
229             record.timestamp = gpu_timestamp;
230             record.data.resize(header->size); // Possibly 264?
231             memcpy(record.data.data(), iter, header->size);
232             records.emplace_back(record);
233          }
234       }
235 
236       // Go to the next record
237       iter += header->size;
238    }
239 
240    return records;
241 }
242 
243 /// @brief Read all the available data from the metric set currently in use
read_data_from_metric_set()244 void IntelDriver::read_data_from_metric_set()
245 {
246    assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
247 
248    ssize_t bytes_read = 0;
249    while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
250               metric_buffer.size() - total_bytes_read)) > 0 ||
251       errno == EINTR) {
252       total_bytes_read += std::max(ssize_t(0), bytes_read);
253 
254       // Increase size of the buffer for the next read
255       if (metric_buffer.size() / 2 < total_bytes_read) {
256          metric_buffer.resize(metric_buffer.size() * 2);
257       }
258    }
259 
260    assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
261 }
262 
dump_perfcnt()263 bool IntelDriver::dump_perfcnt()
264 {
265    if (!perf->oa_stream_ready()) {
266       return false;
267    }
268 
269    read_data_from_metric_set();
270 
271    auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
272    if (new_records.empty()) {
273       // No new records from the GPU yet
274       return false;
275    } else {
276       // Records are parsed correctly, so we can reset the
277       // number of bytes read so far from the metric set
278       total_bytes_read = 0;
279    }
280 
281    APPEND(records, new_records);
282 
283    if (records.size() < 2) {
284       // Not enough records to accumulate
285       return false;
286    }
287 
288    return true;
289 }
290 
gpu_next()291 uint64_t IntelDriver::gpu_next()
292 {
293    if (records.size() < 2) {
294       // Not enough records to accumulate
295       return 0;
296    }
297 
298    // Get first and second
299    auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
300    auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
301 
302    intel_perf_query_result_accumulate_fields(&perf->result,
303                                              selected_query,
304                                              &perf->devinfo,
305                                              record_a + 1,
306                                              record_b + 1,
307                                              false /* no_oa_accumulate */);
308 
309    // Get last timestamp
310    auto gpu_timestamp = records[1].timestamp;
311 
312    // Consume first record
313    records.erase(std::begin(records), std::begin(records) + 1);
314 
315    return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
316 }
317 
next()318 uint64_t IntelDriver::next()
319 {
320    // Reset accumulation
321    intel_perf_query_result_clear(&perf->result);
322    return gpu_next();
323 }
324 
gpu_clock_id() const325 uint32_t IntelDriver::gpu_clock_id() const
326 {
327    return this->clock_id;
328 }
329 
gpu_timestamp() const330 uint64_t IntelDriver::gpu_timestamp() const
331 {
332    return intel_device_info_timebase_scale(&perf->devinfo,
333                                            intel_read_gpu_timestamp(drm_device.fd));
334 }
335 
336 } // namespace pps
337