1 /*
2  * Copyright © 2021 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <perfetto.h>
25 
26 #include "tu_perfetto.h"
27 
28 #include "util/u_perfetto.h"
29 #include "util/hash_table.h"
30 
31 #include "tu_tracepoints.h"
32 #include "tu_tracepoints_perfetto.h"
33 
/* Id of the custom perfetto clock that GPU timestamps are reported against.
 * Assigned in TuRenderpassDataSource::OnStart().
 */
static uint32_t gpu_clock_id = 0;

/* CPU (boot-time clock) time, in ns, before which sync_timestamp() will not
 * emit another clock snapshot.
 */
static uint64_t next_clock_sync_ns = 0;

/**
 * The GPU timestamp (in ns) recorded when a clock_sync was last emitted.
 * Zero until the first clock_sync has been emitted.
 *
 * The first clock_sync is captured from the CPU *after* the first GPU
 * tracepoints happen, so it carries a *later* timestamp than the first GPU
 * traces.  To avoid confusing perfetto we need to drop the GPU traces with
 * timestamps before this.
 */
static uint64_t sync_gpu_ts = 0;
45 
/* Incremental state kept for the renderpass data source.  The flag starts out
 * set; stage_end() sends the render-stage descriptors while it is set and
 * then resets it.
 */
struct TuRenderpassIncrementalState {
   bool was_cleared{true};
};
49 
50 struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
51    using IncrementalStateType = TuRenderpassIncrementalState;
52 };
53 
54 class TuRenderpassDataSource : public perfetto::DataSource<TuRenderpassDataSource, TuRenderpassTraits> {
55 public:
OnSetup(const SetupArgs &)56    void OnSetup(const SetupArgs &) override
57    {
58       // Use this callback to apply any custom configuration to your data source
59       // based on the TraceConfig in SetupArgs.
60    }
61 
OnStart(const StartArgs &)62    void OnStart(const StartArgs &) override
63    {
64       // This notification can be used to initialize the GPU driver, enable
65       // counters, etc. StartArgs will contains the DataSourceDescriptor,
66       // which can be extended.
67       u_trace_perfetto_start();
68       PERFETTO_LOG("Tracing started");
69 
70       /* Note: clock_id's below 128 are reserved.. for custom clock sources,
71        * using the hash of a namespaced string is the recommended approach.
72        * See: https://perfetto.dev/docs/concepts/clock-sync
73        */
74       gpu_clock_id =
75          _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;
76    }
77 
OnStop(const StopArgs &)78    void OnStop(const StopArgs &) override
79    {
80       PERFETTO_LOG("Tracing stopped");
81 
82       // Undo any initialization done in OnStart.
83       u_trace_perfetto_stop();
84       // TODO we should perhaps block until queued traces are flushed?
85 
86       Trace([](TuRenderpassDataSource::TraceContext ctx) {
87          auto packet = ctx.NewTracePacket();
88          packet->Finalize();
89          ctx.Flush();
90       });
91    }
92 };
93 
94 PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
95 PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
96 
97 static void
send_descriptors(TuRenderpassDataSource::TraceContext & ctx,uint64_t ts_ns)98 send_descriptors(TuRenderpassDataSource::TraceContext &ctx, uint64_t ts_ns)
99 {
100    PERFETTO_LOG("Sending renderstage descriptors");
101 
102    auto packet = ctx.NewTracePacket();
103 
104    packet->set_timestamp(0);
105 
106    auto event = packet->set_gpu_render_stage_event();
107    event->set_gpu_id(0);
108 
109    auto spec = event->set_specifications();
110 
111    for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
112       auto desc = spec->add_hw_queue();
113 
114       desc->set_name(queues[i].name);
115       desc->set_description(queues[i].desc);
116    }
117 
118    for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
119       auto desc = spec->add_stage();
120 
121       desc->set_name(stages[i].name);
122       if (stages[i].desc)
123          desc->set_description(stages[i].desc);
124    }
125 }
126 
127 static void
stage_start(struct tu_device * dev,uint64_t ts_ns,enum tu_stage_id stage)128 stage_start(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage)
129 {
130    struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
131 
132    p->start_ts[stage] = ts_ns;
133 }
134 
135 typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
136 
137 static void
stage_end(struct tu_device * dev,uint64_t ts_ns,enum tu_stage_id stage,uint32_t submission_id,const void * payload=nullptr,trace_payload_as_extra_func payload_as_extra=nullptr)138 stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage,
139           uint32_t submission_id, const void* payload = nullptr,
140           trace_payload_as_extra_func payload_as_extra = nullptr)
141 {
142    struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
143 
144    /* If we haven't managed to calibrate the alignment between GPU and CPU
145     * timestamps yet, then skip this trace, otherwise perfetto won't know
146     * what to do with it.
147     */
148    if (!sync_gpu_ts)
149       return;
150 
151    TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
152       if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
153          send_descriptors(tctx, p->start_ts[stage]);
154          state->was_cleared = false;
155       }
156 
157       auto packet = tctx.NewTracePacket();
158 
159       packet->set_timestamp(p->start_ts[stage]);
160       packet->set_timestamp_clock_id(gpu_clock_id);
161 
162       auto event = packet->set_gpu_render_stage_event();
163       event->set_event_id(0); // ???
164       event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID);
165       event->set_duration(ts_ns - p->start_ts[stage]);
166       event->set_stage_id(stage);
167       event->set_context((uintptr_t)dev);
168       event->set_submission_id(submission_id);
169 
170       if (payload && payload_as_extra) {
171          payload_as_extra(event, payload);
172       }
173    });
174 }
175 
176 #ifdef __cplusplus
177 extern "C" {
178 #endif
179 
180 void
tu_perfetto_init(void)181 tu_perfetto_init(void)
182 {
183    util_perfetto_init();
184 
185    perfetto::DataSourceDescriptor dsd;
186    dsd.set_name("gpu.renderstages.msm");
187    TuRenderpassDataSource::Register(dsd);
188 }
189 
190 static void
sync_timestamp(struct tu_device * dev)191 sync_timestamp(struct tu_device *dev)
192 {
193    uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
194    uint64_t gpu_ts = 0;
195 
196    if (cpu_ts < next_clock_sync_ns)
197       return;
198 
199     if (tu_device_get_timestamp(dev, &gpu_ts)) {
200       PERFETTO_ELOG("Could not sync CPU and GPU clocks");
201       return;
202     }
203 
204    /* convert GPU ts into ns: */
205    gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
206 
207    TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
208       auto packet = tctx.NewTracePacket();
209 
210       packet->set_timestamp(cpu_ts);
211 
212       auto event = packet->set_clock_snapshot();
213 
214       {
215          auto clock = event->add_clocks();
216 
217          clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
218          clock->set_timestamp(cpu_ts);
219       }
220 
221       {
222          auto clock = event->add_clocks();
223 
224          clock->set_clock_id(gpu_clock_id);
225          clock->set_timestamp(gpu_ts);
226       }
227 
228       sync_gpu_ts = gpu_ts;
229       next_clock_sync_ns = cpu_ts + 30000000;
230    });
231 }
232 
233 static void
emit_submit_id(uint32_t submission_id)234 emit_submit_id(uint32_t submission_id)
235 {
236    TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
237       auto packet = tctx.NewTracePacket();
238 
239       packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
240 
241       auto event = packet->set_vulkan_api_event();
242       auto submit = event->set_vk_queue_submit();
243 
244       submit->set_submission_id(submission_id);
245    });
246 }
247 
248 void
tu_perfetto_submit(struct tu_device * dev,uint32_t submission_id)249 tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
250 {
251    sync_timestamp(dev);
252    emit_submit_id(submission_id);
253 }
254 
255 /*
256  * Trace callbacks, called from u_trace once the timestamps from GPU have been
257  * collected.
258  */
259 
260 #define CREATE_EVENT_CALLBACK(event_name, stage)                              \
261 void                                                                          \
262 tu_start_##event_name(struct tu_device *dev, uint64_t ts_ns,                  \
263                    const void *flush_data,                                    \
264                    const struct trace_start_##event_name *payload)            \
265 {                                                                             \
266    stage_start(dev, ts_ns, stage);                                            \
267 }                                                                             \
268                                                                               \
269 void                                                                          \
270 tu_end_##event_name(struct tu_device *dev, uint64_t ts_ns,                    \
271                    const void *flush_data,                                    \
272                    const struct trace_end_##event_name *payload)              \
273 {                                                                             \
274    auto trace_flush_data = (const struct tu_u_trace_flush_data *) flush_data; \
275    uint32_t submission_id =                                                   \
276       tu_u_trace_flush_data_get_submit_id(trace_flush_data);                  \
277    stage_end(dev, ts_ns, stage, submission_id, payload,                       \
278       (trace_payload_as_extra_func) &trace_payload_as_extra_end_##event_name);\
279 }
280 
281 CREATE_EVENT_CALLBACK(render_pass, SURFACE_STAGE_ID)
282 CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
283 CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
284 CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
285 CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
286 CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID)
287 CREATE_EVENT_CALLBACK(gmem_clear, CLEAR_GMEM_STAGE_ID)
288 CREATE_EVENT_CALLBACK(sysmem_clear, CLEAR_SYSMEM_STAGE_ID)
289 CREATE_EVENT_CALLBACK(sysmem_clear_all, CLEAR_SYSMEM_STAGE_ID)
290 CREATE_EVENT_CALLBACK(gmem_load, GMEM_LOAD_STAGE_ID)
291 CREATE_EVENT_CALLBACK(gmem_store, GMEM_STORE_STAGE_ID)
292 CREATE_EVENT_CALLBACK(sysmem_resolve, SYSMEM_RESOLVE_STAGE_ID)
293 
294 #ifdef __cplusplus
295 }
296 #endif
297