1 /*
2 * Copyright © 2021 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <perfetto.h>
25
26 #include "tu_perfetto.h"
27
28 #include "util/u_perfetto.h"
29 #include "util/hash_table.h"
30
31 #include "tu_tracepoints.h"
32 #include "tu_tracepoints_perfetto.h"
33
/* Perfetto clock id under which GPU timestamps are reported; it is
 * assigned in TuRenderpassDataSource::OnStart().
 */
static uint32_t gpu_clock_id = 0;

/* CPU (boottime) time, in ns, of the next clock sync; sync_timestamp()
 * returns early while the current time is still below it.
 */
static uint64_t next_clock_sync_ns = 0;

/**
 * GPU timestamp (in ns) recorded when a clock_sync was emitted.
 *
 * The first clock_sync is captured from the CPU *after* the first GPU
 * tracepoints happen, so this will be a *later* timestamp than the first
 * GPU traces.  To avoid confusing perfetto we need to drop the GPU traces
 * with timestamps before this.  While the value is still zero, stage_end()
 * drops every event.
 */
static uint64_t sync_gpu_ts = 0;
45
/* Incremental state attached to the data source (see TuRenderpassTraits).
 * was_cleared starts out true; stage_end() sends the render-stage
 * descriptors when it finds the flag set and then resets it to false.
 */
struct TuRenderpassIncrementalState {
   bool was_cleared{true};
};
49
50 struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
51 using IncrementalStateType = TuRenderpassIncrementalState;
52 };
53
54 class TuRenderpassDataSource : public perfetto::DataSource<TuRenderpassDataSource, TuRenderpassTraits> {
55 public:
OnSetup(const SetupArgs &)56 void OnSetup(const SetupArgs &) override
57 {
58 // Use this callback to apply any custom configuration to your data source
59 // based on the TraceConfig in SetupArgs.
60 }
61
OnStart(const StartArgs &)62 void OnStart(const StartArgs &) override
63 {
64 // This notification can be used to initialize the GPU driver, enable
65 // counters, etc. StartArgs will contains the DataSourceDescriptor,
66 // which can be extended.
67 u_trace_perfetto_start();
68 PERFETTO_LOG("Tracing started");
69
70 /* Note: clock_id's below 128 are reserved.. for custom clock sources,
71 * using the hash of a namespaced string is the recommended approach.
72 * See: https://perfetto.dev/docs/concepts/clock-sync
73 */
74 gpu_clock_id =
75 _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;
76 }
77
OnStop(const StopArgs &)78 void OnStop(const StopArgs &) override
79 {
80 PERFETTO_LOG("Tracing stopped");
81
82 // Undo any initialization done in OnStart.
83 u_trace_perfetto_stop();
84 // TODO we should perhaps block until queued traces are flushed?
85
86 Trace([](TuRenderpassDataSource::TraceContext ctx) {
87 auto packet = ctx.NewTracePacket();
88 packet->Finalize();
89 ctx.Flush();
90 });
91 }
92 };
93
94 PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
95 PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
96
97 static void
send_descriptors(TuRenderpassDataSource::TraceContext & ctx,uint64_t ts_ns)98 send_descriptors(TuRenderpassDataSource::TraceContext &ctx, uint64_t ts_ns)
99 {
100 PERFETTO_LOG("Sending renderstage descriptors");
101
102 auto packet = ctx.NewTracePacket();
103
104 packet->set_timestamp(0);
105
106 auto event = packet->set_gpu_render_stage_event();
107 event->set_gpu_id(0);
108
109 auto spec = event->set_specifications();
110
111 for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
112 auto desc = spec->add_hw_queue();
113
114 desc->set_name(queues[i].name);
115 desc->set_description(queues[i].desc);
116 }
117
118 for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
119 auto desc = spec->add_stage();
120
121 desc->set_name(stages[i].name);
122 if (stages[i].desc)
123 desc->set_description(stages[i].desc);
124 }
125 }
126
127 static void
stage_start(struct tu_device * dev,uint64_t ts_ns,enum tu_stage_id stage)128 stage_start(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage)
129 {
130 struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
131
132 p->start_ts[stage] = ts_ns;
133 }
134
135 typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
136
137 static void
stage_end(struct tu_device * dev,uint64_t ts_ns,enum tu_stage_id stage,uint32_t submission_id,const void * payload=nullptr,trace_payload_as_extra_func payload_as_extra=nullptr)138 stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage,
139 uint32_t submission_id, const void* payload = nullptr,
140 trace_payload_as_extra_func payload_as_extra = nullptr)
141 {
142 struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
143
144 /* If we haven't managed to calibrate the alignment between GPU and CPU
145 * timestamps yet, then skip this trace, otherwise perfetto won't know
146 * what to do with it.
147 */
148 if (!sync_gpu_ts)
149 return;
150
151 TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
152 if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
153 send_descriptors(tctx, p->start_ts[stage]);
154 state->was_cleared = false;
155 }
156
157 auto packet = tctx.NewTracePacket();
158
159 packet->set_timestamp(p->start_ts[stage]);
160 packet->set_timestamp_clock_id(gpu_clock_id);
161
162 auto event = packet->set_gpu_render_stage_event();
163 event->set_event_id(0); // ???
164 event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID);
165 event->set_duration(ts_ns - p->start_ts[stage]);
166 event->set_stage_id(stage);
167 event->set_context((uintptr_t)dev);
168 event->set_submission_id(submission_id);
169
170 if (payload && payload_as_extra) {
171 payload_as_extra(event, payload);
172 }
173 });
174 }
175
176 #ifdef __cplusplus
177 extern "C" {
178 #endif
179
180 void
tu_perfetto_init(void)181 tu_perfetto_init(void)
182 {
183 util_perfetto_init();
184
185 perfetto::DataSourceDescriptor dsd;
186 dsd.set_name("gpu.renderstages.msm");
187 TuRenderpassDataSource::Register(dsd);
188 }
189
190 static void
sync_timestamp(struct tu_device * dev)191 sync_timestamp(struct tu_device *dev)
192 {
193 uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
194 uint64_t gpu_ts = 0;
195
196 if (cpu_ts < next_clock_sync_ns)
197 return;
198
199 if (tu_device_get_timestamp(dev, &gpu_ts)) {
200 PERFETTO_ELOG("Could not sync CPU and GPU clocks");
201 return;
202 }
203
204 /* convert GPU ts into ns: */
205 gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
206
207 TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
208 auto packet = tctx.NewTracePacket();
209
210 packet->set_timestamp(cpu_ts);
211
212 auto event = packet->set_clock_snapshot();
213
214 {
215 auto clock = event->add_clocks();
216
217 clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
218 clock->set_timestamp(cpu_ts);
219 }
220
221 {
222 auto clock = event->add_clocks();
223
224 clock->set_clock_id(gpu_clock_id);
225 clock->set_timestamp(gpu_ts);
226 }
227
228 sync_gpu_ts = gpu_ts;
229 next_clock_sync_ns = cpu_ts + 30000000;
230 });
231 }
232
233 static void
emit_submit_id(uint32_t submission_id)234 emit_submit_id(uint32_t submission_id)
235 {
236 TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
237 auto packet = tctx.NewTracePacket();
238
239 packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
240
241 auto event = packet->set_vulkan_api_event();
242 auto submit = event->set_vk_queue_submit();
243
244 submit->set_submission_id(submission_id);
245 });
246 }
247
248 void
tu_perfetto_submit(struct tu_device * dev,uint32_t submission_id)249 tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
250 {
251 sync_timestamp(dev);
252 emit_submit_id(submission_id);
253 }
254
/*
 * Trace callbacks, called from u_trace once the timestamps from GPU have been
 * collected.
 *
 * CREATE_EVENT_CALLBACK(event_name, stage) defines the pair
 * tu_start_<event_name>() / tu_end_<event_name>().  The start callback
 * forwards to stage_start(); the end callback reads the submit id from the
 * flush data and forwards to stage_end() together with the payload and the
 * matching trace_payload_as_extra_end_<event_name> helper.
 */

#define CREATE_EVENT_CALLBACK(event_name, stage)                              \
   void                                                                       \
   tu_start_##event_name(                                                     \
      struct tu_device *dev, uint64_t ts_ns, const void *flush_data,          \
      const struct trace_start_##event_name *payload)                         \
   {                                                                          \
      stage_start(dev, ts_ns, stage);                                         \
   }                                                                          \
                                                                              \
   void                                                                       \
   tu_end_##event_name(                                                       \
      struct tu_device *dev, uint64_t ts_ns, const void *flush_data,          \
      const struct trace_end_##event_name *payload)                           \
   {                                                                          \
      const uint32_t submit_id = tu_u_trace_flush_data_get_submit_id(         \
         (const struct tu_u_trace_flush_data *) flush_data);                  \
      stage_end(dev, ts_ns, stage, submit_id, payload,                        \
                (trace_payload_as_extra_func)                                 \
                   &trace_payload_as_extra_end_##event_name);                 \
   }
280
281 CREATE_EVENT_CALLBACK(render_pass, SURFACE_STAGE_ID)
282 CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
283 CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
284 CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
285 CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
286 CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID)
287 CREATE_EVENT_CALLBACK(gmem_clear, CLEAR_GMEM_STAGE_ID)
288 CREATE_EVENT_CALLBACK(sysmem_clear, CLEAR_SYSMEM_STAGE_ID)
289 CREATE_EVENT_CALLBACK(sysmem_clear_all, CLEAR_SYSMEM_STAGE_ID)
290 CREATE_EVENT_CALLBACK(gmem_load, GMEM_LOAD_STAGE_ID)
291 CREATE_EVENT_CALLBACK(gmem_store, GMEM_STORE_STAGE_ID)
292 CREATE_EVENT_CALLBACK(sysmem_resolve, SYSMEM_RESOLVE_STAGE_ID)
293
294 #ifdef __cplusplus
295 }
296 #endif
297