/*
 * Copyright © 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

24 #ifndef _U_TRACE_H
25 #define _U_TRACE_H
26
27 #include <stdbool.h>
28 #include <stdint.h>
29 #include <stdio.h>
30
31 #include "util/u_queue.h"
32
33 #ifdef __cplusplus
34 extern "C" {
35 #endif
36
/* A trace mechanism (very) loosely inspired by the Linux kernel tracepoint
 * mechanism, in that it allows for defining driver specific (or common)
 * tracepoints, which generate 'trace_$name()' functions that can be
 * called at various points in commandstream emit.
 *
 * Currently a printf backend is implemented, but the expectation is to
 * also implement a perfetto backend for shipping out traces to a tool like
 * AGI.
 *
 * Notable differences:
 *
 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
 *    to a buffer.  At a later point in time (when stalling to wait for the
 *    GPU is not required), the timestamps are re-united with the trace
 *    payload.  This makes the trace mechanism suitable for profiling.
 *
 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
 *    tracepoints is split into two stages.  Traces are emitted to a
 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
 *    instance.  This avoids the requirement that commandstream containing
 *    tracepoints is emitted in the same order as it is generated.
 *
 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
 *    then a `u_trace_context` per-engine should be used.
 *
 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in python,
 *    from which header and src files are generated.  Since we already have
 *    a build dependency on python+mako, this gives more flexibility than
 *    clunky preprocessor macro magic.
 *
 */

/* Forward declarations, so the callback typedefs below can take pointers
 * to these types before the structs themselves are defined.
 */
struct u_trace_context;
struct u_trace;
struct u_trace_chunk;

/**
 * Special reserved value to indicate that no timestamp was captured,
 * and that the timestamp of the previous trace should be reused.
 *
 * The value is a 64-bit zero, matching the uint64_t returned by the
 * u_trace_read_ts callback.
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t)0)

/**
 * Driver provided callback to create a timestamp buffer which will be
 * read by u_trace_read_ts function.
 *
 * The returned pointer is opaque to u_trace; the other driver callbacks
 * receive such a pointer as their 'timestamps' argument.
 *
 * NOTE(review): timestamps_count is presumably the number of 64b timestamp
 * slots the buffer must hold -- confirm against the caller.
 */
typedef void *(*u_trace_create_ts_buffer)(struct u_trace_context *utctx,
                                          uint32_t timestamps_count);

/**
 * Driver provided callback to delete a timestamp buffer.
 *
 * 'timestamps' is the opaque driver buffer to destroy.
 */
typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx,
                                         void *timestamps);

/**
 * Driver provided callback to emit commands into the specified command
 * stream to capture a 64b timestamp into the specified timestamps buffer,
 * at the specified index.
 *
 * The hw counter that the driver records should be something that runs at
 * a fixed rate, even as the GPU freq changes.  The same source used for
 * GL_TIMESTAMP queries should be appropriate.
 *
 * NOTE(review): end_of_pipe presumably selects an end-of-pipe capture
 * rather than a top-of-pipe one -- confirm against the driver
 * implementations.
 */
typedef void (*u_trace_record_ts)(struct u_trace *ut, void *cs,
                                  void *timestamps, unsigned idx,
                                  bool end_of_pipe);

/**
 * Driver provided callback to read back a previously recorded timestamp.
 * If necessary, this should block until the GPU has finished writing back
 * the timestamps.  (The timestamps will be read back in order, so it is
 * safe to only synchronize on idx==0.)
 *
 * flush_data is data provided by the driver via u_trace_flush.
 *
 * The returned timestamp should be in units of nanoseconds.  The same
 * timebase as GL_TIMESTAMP queries should be used.
 *
 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
 * that no timestamp was captured and the timestamp from the previous trace
 * will be re-used.  (The first trace in the u_trace buf may not do this.)
 * This allows the driver to detect cases where multiple tracepoints are
 * emitted with no other intervening cmdstream, to avoid pointlessly
 * capturing the same timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
                                    void *timestamps, unsigned idx,
                                    void *flush_data);

/**
 * Driver provided callback to delete flush data.
 *
 * flush_data is the driver data that was handed to u_trace_flush().
 */
typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
                                          void *flush_data);

132 /**
133 * The trace context provides tracking for "in-flight" traces, once the
134 * cmdstream that records timestamps has been flushed.
135 */
136 struct u_trace_context {
137 void *pctx;
138
139 u_trace_create_ts_buffer create_timestamp_buffer;
140 u_trace_delete_ts_buffer delete_timestamp_buffer;
141 u_trace_record_ts record_timestamp;
142 u_trace_read_ts read_timestamp;
143 u_trace_delete_flush_data delete_flush_data;
144
145 FILE *out;
146
147 /* Once u_trace_flush() is called u_trace_chunk's are queued up to
148 * render tracepoints on a queue. The per-chunk queue jobs block until
149 * timestamps are available.
150 */
151 struct util_queue queue;
152
153 #ifdef HAVE_PERFETTO
154 /* node in global list of trace contexts. */
155 struct list_head node;
156 #endif
157
158 /* State to accumulate time across N chunks associated with a single
159 * batch (u_trace).
160 */
161 uint64_t last_time_ns;
162 uint64_t first_time_ns;
163
164 uint32_t frame_nr;
165
166 /* list of unprocessed trace chunks in fifo order: */
167 struct list_head flushed_trace_chunks;
168 };
169
170 /**
171 * The u_trace ptr is passed as the first arg to generated tracepoints.
172 * It provides buffering for tracepoint payload until the corresponding
173 * driver cmdstream containing the emitted commands to capture is
174 * flushed.
175 *
176 * Individual tracepoints emitted to u_trace are expected to be "executed"
177 * (ie. timestamp captured) in FIFO order with respect to other tracepoints
178 * emitted to the same u_trace. But the order WRT other u_trace instances
179 * is undefined util u_trace_flush().
180 */
181 struct u_trace {
182 struct u_trace_context *utctx;
183
184 struct list_head trace_chunks; /* list of unflushed trace chunks in fifo order */
185
186 bool enabled;
187 };
188
189 void u_trace_context_init(struct u_trace_context *utctx,
190 void *pctx,
191 u_trace_create_ts_buffer create_timestamp_buffer,
192 u_trace_delete_ts_buffer delete_timestamp_buffer,
193 u_trace_record_ts record_timestamp,
194 u_trace_read_ts read_timestamp,
195 u_trace_delete_flush_data delete_flush_data);
196 void u_trace_context_fini(struct u_trace_context *utctx);
197
/**
 * Flush (trigger processing) of traces previously flushed to the trace-context
 * by u_trace_flush().
 *
 * This should typically be called in the driver's pctx->flush().
 *
 * NOTE(review): 'eof' presumably signals end-of-frame (the context keeps a
 * frame_nr counter) -- confirm against the implementation.
 */
void u_trace_context_process(struct u_trace_context *utctx, bool eof);

/* Set up / tear down a u_trace that emits into the given trace context. */
void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

/* NOTE(review): presumably reports whether any tracepoints have been
 * emitted to 'ut' -- confirm against the implementation.
 */
bool u_trace_has_points(struct u_trace *ut);

/**
 * Position within the tracepoints of a u_trace: the owning u_trace, a
 * chunk, and an event index.
 *
 * NOTE(review): event_idx is presumably relative to 'chunk' -- confirm
 * against the implementation.
 */
struct u_trace_iterator
{
   struct u_trace *ut;
   struct u_trace_chunk *chunk;
   uint32_t event_idx;
};

/* Iterators delimiting the tracepoints of 'ut'.  They are used as the
 * begin/end arguments of u_trace_clone_append() and
 * u_trace_disable_event_range().
 */
struct u_trace_iterator
u_trace_begin_iterator(struct u_trace *ut);

struct u_trace_iterator
u_trace_end_iterator(struct u_trace *ut);

/* Compare two iterators for equality. */
bool
u_trace_iterator_equal(struct u_trace_iterator a,
                       struct u_trace_iterator b);

/**
 * Driver provided callback, passed to u_trace_clone_append(), to copy
 * timestamps on the GPU from one timestamp buffer (ts_from) to another
 * (ts_to) via commands emitted into 'cmdstream'.
 *
 * NOTE(review): from_offset, to_offset and count are presumably expressed
 * in timestamp entries rather than bytes -- confirm against the caller.
 */
typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx,
                                       void *cmdstream,
                                       void *ts_from, uint32_t from_offset,
                                       void *ts_to, uint32_t to_offset,
                                       uint32_t count);

234 /**
235 * Clones tracepoints range into target u_trace.
236 * Provides callback for driver to copy timestamps on GPU from
237 * one buffer to another.
238 *
239 * It allows:
240 * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
241 * each time it is submitted.
242 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
243 * corresponding to a tile.
244 */
245 void u_trace_clone_append(struct u_trace_iterator begin_it,
246 struct u_trace_iterator end_it,
247 struct u_trace *into,
248 void *cmdstream,
249 u_trace_copy_ts_buffer copy_ts_buffer);
250
/* NOTE(review): presumably disables the tracepoints between begin_it and
 * end_it -- confirm the exact range semantics (inclusive/exclusive end)
 * against the implementation.
 */
void u_trace_disable_event_range(struct u_trace_iterator begin_it,
                                 struct u_trace_iterator end_it);

/**
 * Flush traces to the parent trace-context.  At this point, the expectation
 * is that all the tracepoints are "executed" by the GPU following any previously
 * flushed u_trace batch.
 *
 * flush_data is a way for driver to pass additional data, which becomes available
 * only at the point of flush, to the u_trace_read_ts callback and perfetto.
 * The typical example of such data would be a fence to wait on in u_trace_read_ts,
 * and a submission_id to pass into perfetto.
 * The destruction of the data is done via u_trace_delete_flush_data.
 *
 * NOTE(review): free_data presumably controls whether u_trace takes
 * ownership of flush_data and destroys it through that callback -- confirm
 * against the implementation.
 *
 * This should typically be called when the corresponding cmdstream (containing
 * the timestamp reads) is flushed to the kernel.
 */
void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data);

/**
 * Whether command buffers should be instrumented even if not collecting
 * traces.
 */
extern bool ut_trace_instrument;

#ifdef HAVE_PERFETTO
/* Perfetto state; a value greater than zero counts as "enabled" in the
 * u_trace_context_*() inline helpers.
 */
extern int ut_perfetto_enabled;

void u_trace_perfetto_start(void);
void u_trace_perfetto_stop(void);
#else
/* Without perfetto support the flag is a compile-time constant 0, so the
 * checks against it fold away.
 */
#  define ut_perfetto_enabled 0
#endif

285 static inline bool
u_trace_context_actively_tracing(struct u_trace_context * utctx)286 u_trace_context_actively_tracing(struct u_trace_context *utctx)
287 {
288 return !!utctx->out || (ut_perfetto_enabled > 0);
289 }
290
291 static inline bool
u_trace_context_instrumenting(struct u_trace_context * utctx)292 u_trace_context_instrumenting(struct u_trace_context *utctx)
293 {
294 return !!utctx->out || ut_trace_instrument || (ut_perfetto_enabled > 0);
295 }
296
297 #ifdef __cplusplus
298 }
299 #endif
300
301 #endif /* _U_TRACE_H */
302