/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"

#include "ac_rgp.h"
#include "ac_sqtt.h"
#include "util/u_memory.h"
#include "tgsi/tgsi_from_mesa.h"

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

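/* Allocate the thread trace buffer object. The layout, assumed by the
 * ac_sqtt helpers, is an array of ac_thread_trace_info structs (one per
 * shader engine) followed by one data buffer of buffer_size bytes per
 * shader engine, all aligned to 1 << SQTT_BUFFER_ALIGN_SHIFT.
 */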
static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                             1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;

   sctx->thread_trace->bo =
      ws->buffer_create(ws, size, 4096,
                        RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                        RADEON_FLAG_GTT_WC |
                        RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->thread_trace->bo)
      return false;

   return true;
}

static bool
si_se_is_disabled(struct si_context *sctx, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return sctx->screen->info.cu_mask[se][0] == 0;
}

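/* Program the per-SE SQTT state (buffer address/size, CU/WGP selection,
 * token and register masks) and emit the event that starts the trace.
 * GFX10+ uses the privileged SQ_THREAD_TRACE_* registers, older chips the
 * uconfig variants.
 */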
static void
si_emit_thread_trace_start(struct si_context *sctx,
                           struct radeon_cmdbuf *cs,
                           uint32_t queue_family_index)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      if (si_se_is_disabled(sctx, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CUs. */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) |
                                          S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         int wgp = first_active_cu / 2;
         radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                                          S_008D14_SA_SEL(0) |
                                          S_008D14_WGP_SEL(wgp) |
                                          S_008D14_SIMD_SEL(0));

         radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
                                                               V_008D18_REG_INCLUDE_SHDEC |
                                                               V_008D18_REG_INCLUDE_GFXUDEC |
                                                               V_008D18_REG_INCLUDE_CONTEXT |
                                                               V_008D18_REG_INCLUDE_COMP |
                                                               V_008D18_REG_INCLUDE_CONFIG) |
                                          S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(1) |
                                          S_008D1C_HIWATER(5) |
                                          S_008D1C_UTIL_TIMER(1) |
                                          S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                          S_008D1C_DRAW_EVENT_EN(1) |
                                          S_008D1C_REG_STALL_EN(1) |
                                          S_008D1C_SPI_STALL_EN(1) |
                                          S_008D1C_SQ_STALL_EN(1) |
                                          S_008D1C_REG_DROP_ON_STALL(0) |
                                          S_008D1C_LOWATER_OFFSET(
                                             sctx->chip_class >= GFX10_3 ? 4 : 0) |
                                          S_008D1C_AUTO_FLUSH_MODE(sctx->chip_class == GFX10_3));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
                                      S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) |
                                      S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) |
                                      S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK,
                                thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) |
            S_030CD8_MASK_VS(1) |
            S_030CD8_MASK_GS(1) |
            S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) |
            S_030CD8_MASK_LS(1) |
            S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (sctx->chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}

static const uint32_t gfx9_thread_trace_info_regs[] =
{
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] =
{
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

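/* Copy the SQTT info registers (write pointer, status, dropped counter) of
 * one SE into its ac_thread_trace_info slot in the trace BO. COPY_DATA is
 * used so the copy happens on the GPU, after the trace has stopped.
 */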
static void
si_copy_thread_trace_info_regs(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   switch (sctx->chip_class) {
   case GFX10_3:
   case GFX10:
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
      break;
   case GFX9:
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
      break;
   default:
      unreachable("Unsupported chip_class");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                  COPY_DATA_WR_CONFIRM);
      radeon_emit(thread_trace_info_regs[i] >> 2);
      radeon_emit(0); /* unused */
      radeon_emit((info_va + i * 4));
      radeon_emit((info_va + i * 4) >> 32);
   }
   radeon_end();
}

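/* Emit the stop and finish events, then per active SE: wait for the trace
 * to drain, disable the thread trace mode and read back the info registers.
 */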
static void
si_emit_thread_trace_stop(struct si_context *sctx,
                          struct radeon_cmdbuf *cs,
                          uint32_t queue_family_index)
{
   unsigned max_se = sctx->screen->info.max_se;

   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   for (unsigned se = 0; se < max_se; se++) {
      if (si_se_is_disabled(sctx, se))
         continue;

      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_008D20_FINISH_DONE); /* mask */
         radeon_emit(4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_008D20_BUSY); /* mask */
         radeon_emit(4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_030CE8_BUSY); /* mask */
         radeon_emit(4); /* poll interval */
      }
      radeon_end();

      si_copy_thread_trace_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}

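/* Record everything needed to start a capture: reference the trace BO,
 * wait for idle, flush caches, inhibit clock gating, enable SQG events and
 * program the trace registers.
 */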
static void
si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   si_emit_thread_trace_start(sctx, cs, family);
}

static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_thread_trace_stop(sctx, cs, family);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

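/* Build the start/stop command streams once at init time; they are simply
 * resubmitted with cs_flush whenever a capture is triggered.
 */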
static void
si_thread_trace_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   /* Thread trace start CS (only handles RING_GFX). */
   sctx->thread_trace->start_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->start_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_start(sctx, RING_GFX, sctx->thread_trace->start_cs[RING_GFX]);

   /* Thread trace stop CS. */
   sctx->thread_trace->stop_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->stop_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      free(sctx->thread_trace->stop_cs[RING_GFX]);
      sctx->thread_trace->stop_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_stop(sctx, RING_GFX, sctx->thread_trace->stop_cs[RING_GFX]);
}

static void
si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void
si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

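/* Map the trace BO and fill the ac_thread_trace description (one trace per
 * shader engine) that ac_dump_rgp_capture consumes. Fails if the hardware
 * needed more space than the buffer provided.
 */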
static bool
si_get_thread_trace(struct si_context *sctx,
                    struct ac_thread_trace *thread_trace)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(thread_trace, 0, sizeof(*thread_trace));
   thread_trace->num_traces = max_se;

   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
                                                  NULL,
                                                  PIPE_MAP_READ);

   if (!sctx->thread_trace->ptr)
      return false;

   void *thread_trace_ptr = sctx->thread_trace->ptr;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
      void *info_ptr = thread_trace_ptr + info_offset;
      void *data_ptr = thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info =
         (struct ac_thread_trace_info *)info_ptr;

      struct ac_thread_trace_se thread_trace_se = {0};

      if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
         uint32_t expected_size =
            ac_get_expected_buffer_size(&sctx->screen->info, info);
         uint32_t available_size = (info->cur_offset * 32) / 1024;

         fprintf(stderr, "Failed to get the thread trace "
                         "because the buffer is too small. The "
                         "hardware needs %d KB but the "
                         "buffer size is %d KB.\n",
                 expected_size, available_size);
         fprintf(stderr, "Please update the buffer size with "
                         "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      /* For GFX10+ compute_unit really means WGP. */
      thread_trace_se.compute_unit =
         sctx->screen->info.chip_class >= GFX10 ? (first_active_cu / 2) : first_active_cu;

      thread_trace->traces[se] = thread_trace_se;
   }

   thread_trace->data = sctx->thread_trace;
   return true;
}

bool
si_init_thread_trace(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   if (sctx->chip_class < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->chip_class > GFX10_3) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for that GPU!\n");
      return false;
   }

   /* Allocate only after the support checks to avoid leaking on the early
    * returns above. */
   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);

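   /* Environment tunables (the values below are only illustrative):
    *   AMD_THREAD_TRACE_BUFFER_SIZE=65536   per-SE buffer size in KB (64 MB)
    *   AMD_THREAD_TRACE_TRIGGER=500         capture at frame 500
    *   AMD_THREAD_TRACE_TRIGGER=/tmp/trig   capture when that file appears
    *                                        (the file is removed afterwards)
    */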
   /* Default buffer size set to 32MB per SE. */
   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->thread_trace->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->thread_trace->start_frame = atoi(trigger);
      if (sctx->thread_trace->start_frame <= 0) {
         /* This isn't a frame number, must be a file. */
         sctx->thread_trace->trigger_file = strdup(trigger);
         sctx->thread_trace->start_frame = -1;
      }
   }

   if (!si_thread_trace_init_bo(sctx)) {
      free(sctx->thread_trace->trigger_file);
      free(sctx->thread_trace);
      sctx->thread_trace = NULL;
      return false;
   }

   list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);
   simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_loader_events.record);
   simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_code_object.record);
   simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);

   si_thread_trace_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

void
si_destroy_thread_trace(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer *bo = sctx->thread_trace->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->thread_trace->trigger_file)
      free(sctx->thread_trace->trigger_file);

   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[RING_GFX]);
   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[RING_GFX]);

   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);

   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);

   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);

   free(sctx->thread_trace);
   sctx->thread_trace = NULL;
}

static uint64_t num_frames = 0;

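/* Called once per frame. Starts SQTT when the frame counter or the trigger
 * file fires; on the next call it stops the capture, waits for the fence
 * and dumps the RGP file.
 */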
void
si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->thread_trace_enabled) {
      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
      bool file_trigger = false;
      if (sctx->thread_trace->trigger_file &&
          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
         if (unlink(sctx->thread_trace->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for the last submission. */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);

         /* Start SQTT. */
         si_begin_thread_trace(sctx, rcs);

         sctx->thread_trace_enabled = true;
         sctx->thread_trace->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
          * called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_thread_trace thread_trace = {0};

      /* Stop SQTT. */
      si_end_thread_trace(sctx, rcs);
      sctx->thread_trace_enabled = false;
      sctx->thread_trace->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the BO. */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
          si_get_thread_trace(sctx, &thread_trace)) {
         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace, NULL);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
      }
   }

   num_frames++;
}

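/* Write marker payloads through SQ_THREAD_TRACE_USERDATA_2/3. Only two
 * dwords fit per packet, so larger markers are split across several
 * register writes.
 */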
static void
si_emit_thread_trace_userdata(struct si_context *sctx,
                              struct radeon_cmdbuf *cs,
                              const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10);

      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

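/* Toggle the SQG top/bottom-of-pipe events that SQTT needs; on GFX9+ this
 * also (re)programs the SPI priority fields of the same register.
 */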
static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->chip_class >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

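/* The helpers below emit RGP SQTT markers: small structured payloads that
 * the Radeon GPU Profiler decodes from the trace to correlate API-level
 * events (draws, dispatches, barriers, user annotations) with hardware
 * activity.
 */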
static uint32_t num_events = 0;
void
si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                           enum rgp_sqtt_marker_event_type api_type,
                           uint32_t vertex_offset_user_data,
                           uint32_t instance_offset_user_data,
                           uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void
si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void
si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                             unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                    enum rgp_sqtt_marker_user_event_type type,
                    const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = { 0 };
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);
   }
}

bool
si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                               uint64_t pipeline_hash)
{
   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &thread_trace_data->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);

   return false;
}

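/* Map a gallium shader stage to the RGP hardware stage, taking the shader
 * key into account since VS/TES can run as LS, ES, hardware VS or (with
 * NGG) GS.
 */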
static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->ge.as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}

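/* Snapshot the currently bound shaders (code, hashes, register usage, GPU
 * address) into an rgp_code_object_record so the RGP file can show
 * disassembly for this pipeline.
 */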
static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        uint64_t pipeline_hash,
                        bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = shader->bo->gpu_address;
      unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
      record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
      record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
      record->shader_data[gl_shader_stage].code = code;
      record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
      record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
      record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
      record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
      record->shader_data[gl_shader_stage].hw_stage = hw_stage;
      record->shader_data[gl_shader_stage].is_combined = false;
      record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << gl_shader_stage;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

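/* Register a pipeline with the three RGP metadata streams: PSO
 * correlation, code object loader events and the code object itself. Must
 * be called at most once per pipeline hash.
 */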
bool
si_sqtt_register_pipeline(struct si_context *sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;

   assert(!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));

   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
}

void
si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                               uint64_t pipeline_hash,
                               int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->thread_trace_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}
1070