1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2018 Advanced Micro Devices, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * on the rights to use, copy, modify, merge, publish, distribute, sub
10  * license, and/or sell copies of the Software, and to permit persons to whom
11  * the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23  * USE OR OTHER DEALINGS IN THE SOFTWARE.
24  */
25 
26 #include "si_build_pm4.h"
27 #include "si_pipe.h"
28 #include "sid.h"
29 #include "util/os_time.h"
30 #include "util/u_log.h"
31 #include "util/u_upload_mgr.h"
32 #include "ac_debug.h"
33 
34 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
35 {
36    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
37    struct radeon_winsys *ws = ctx->ws;
38    struct si_screen *sscreen = ctx->screen;
39    const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
40    unsigned wait_flags = 0;
41 
42    if (ctx->gfx_flush_in_progress)
43       return;
44 
45    /* The amdgpu kernel driver synchronizes execution for shared DMABUFs between
46     * processes on DRM >= 3.39.0, so we don't have to wait at the end of IBs to
47     * make sure everything is idle.
48     *
49     * The amdgpu winsys synchronizes execution for buffers shared by different
50     * contexts within the same process.
51     *
52     * Interop with AMDVLK, RADV, or OpenCL within the same process requires
53     * explicit fences or glFinish.
54     */
55    if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 39)
56       flags |= RADEON_FLUSH_START_NEXT_GFX_IB_NOW;
57 
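   /* Decide which waits must be appended at the end of this IB so that the
    * next IB (and the kernel) see idle shaders and a coherent L2.
    */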
58    if (!sscreen->info.kernel_flushes_tc_l2_after_ib) {
59       wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2;
60    } else if (ctx->chip_class == GFX6) {
61       /* The kernel flushes L2 before shaders are finished. */
62       wait_flags |= wait_ps_cs;
63    } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW) ||
64               ((flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION) &&
65                 !ws->cs_is_secure(cs))) {
66       /* TODO: this workaround fixes subtitle rendering with mpv -vo=vaapi and
67        * TMZ enabled, but it shouldn't be necessary.
68        */
69       wait_flags |= wait_ps_cs;
70    }
71 
72    /* Drop this flush if it's a no-op. */
73    if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
74        (!wait_flags || !ctx->gfx_last_ib_is_busy) &&
75        !(flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)) {
76       tc_driver_internal_flush_notify(ctx->tc);
77       return;
78    }
79 
80    /* Non-aux contexts must set up no-op API dispatch on GPU resets. This is
81     * similar to si_get_reset_status but here we can ignore soft-recoveries,
82     * while si_get_reset_status can't. */
83    if (!(ctx->context_flags & SI_CONTEXT_FLAG_AUX) &&
84        ctx->device_reset_callback.reset) {
85       enum pipe_reset_status status = ctx->ws->ctx_query_reset_status(ctx->ctx, true, NULL);
86       if (status != PIPE_NO_RESET)
87          ctx->device_reset_callback.reset(ctx->device_reset_callback.data, status);
88    }
89 
90    if (sscreen->debug_flags & DBG(CHECK_VM))
91       flags &= ~PIPE_FLUSH_ASYNC;
92 
93    ctx->gfx_flush_in_progress = true;
94 
95    if (ctx->has_graphics) {
96       if (!list_is_empty(&ctx->active_queries))
97          si_suspend_queries(ctx);
98 
99       ctx->streamout.suspended = false;
100       if (ctx->streamout.begin_emitted) {
101          si_emit_streamout_end(ctx);
102          ctx->streamout.suspended = true;
103 
104          /* Since NGG streamout uses GDS, we need to make GDS
105           * idle when we leave the IB, otherwise another process
106           * might overwrite it while our shaders are busy.
107           */
108          if (sscreen->use_ngg_streamout)
109             wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
110       }
111    }
112 
113    /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
114     * because the kernel doesn't wait for it. */
115    if (ctx->chip_class >= GFX7)
116       si_cp_dma_wait_for_idle(ctx, &ctx->gfx_cs);
117 
118    /* Wait for draw calls to finish if needed. */
119    if (wait_flags) {
120       ctx->flags |= wait_flags;
121       ctx->emit_cache_flush(ctx, &ctx->gfx_cs);
122    }
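   /* The IB is only known to be idle at its end if we waited for both the PS
    * and CS partial flushes above; otherwise the next flush may have to wait.
    */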
123    ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
124 
125    if (ctx->current_saved_cs) {
126       si_trace_emit(ctx);
127 
128       /* Save the IB for debug contexts. */
129       si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
130       ctx->current_saved_cs->flushed = true;
131       ctx->current_saved_cs->time_flush = os_time_get_nano();
132 
133       si_log_hw_flush(ctx);
134    }
135 
136    if (sscreen->debug_flags & DBG(IB))
137       si_print_current_ib(ctx, stderr);
138 
139    if (ctx->is_noop)
140       flags |= RADEON_FLUSH_NOOP;
141 
142    /* Flush the CS. */
143    ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
144 
145    tc_driver_internal_flush_notify(ctx->tc);
146    if (fence)
147       ws->fence_reference(fence, ctx->last_gfx_fence);
148 
149    ctx->num_gfx_cs_flushes++;
150 
151    /* Check VM faults if needed. */
152    if (sscreen->debug_flags & DBG(CHECK_VM)) {
153       /* Use a conservative timeout of 800 ms, after which we stop waiting
154        * and assume the GPU is hung.
155        */
156       ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000);
157 
158       si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
159    }
160 
161    if (unlikely(ctx->thread_trace &&
162                 (flags & PIPE_FLUSH_END_OF_FRAME))) {
163       si_handle_thread_trace(ctx, &ctx->gfx_cs);
164    }
165 
166    if (ctx->current_saved_cs)
167       si_saved_cs_reference(&ctx->current_saved_cs, NULL);
168 
169    si_begin_new_gfx_cs(ctx, false);
170    ctx->gfx_flush_in_progress = false;
171 }
172 
173 static void si_begin_gfx_cs_debug(struct si_context *ctx)
174 {
175    static const uint32_t zeros[1];
176    assert(!ctx->current_saved_cs);
177 
178    ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
179    if (!ctx->current_saved_cs)
180       return;
181 
182    pipe_reference_init(&ctx->current_saved_cs->reference, 1);
183 
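   /* A tiny GPU-writable buffer that holds the ID of the last trace point the
    * GPU reached; it is inspected when logging flushes and diagnosing hangs.
    */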
184    ctx->current_saved_cs->trace_buf =
185       si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4));
186    if (!ctx->current_saved_cs->trace_buf) {
187       free(ctx->current_saved_cs);
188       ctx->current_saved_cs = NULL;
189       return;
190    }
191 
192    pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros),
193                                zeros);
194    ctx->current_saved_cs->trace_id = 0;
195 
196    si_trace_emit(ctx);
197 
198    radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
199                              RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
200 }
201 
202 static void si_add_gds_to_buffer_list(struct si_context *sctx)
203 {
204    if (sctx->gds) {
205       sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
206       if (sctx->gds_oa) {
207          sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
208       }
209    }
210 }
211 
212 void si_allocate_gds(struct si_context *sctx)
213 {
214    struct radeon_winsys *ws = sctx->ws;
215 
216    if (sctx->gds)
217       return;
218 
219    assert(sctx->screen->use_ngg_streamout);
220 
221    /* 4 streamout GDS counters.
222     * We need 256B (64 dw) of GDS, otherwise streamout hangs.
223     */
224    sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, RADEON_FLAG_DRIVER_INTERNAL);
225    sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, RADEON_FLAG_DRIVER_INTERNAL);
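   /* The small OA allocation presumably backs the ordered-append counters that
    * NGG streamout uses together with GDS.
    */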
226 
227    assert(sctx->gds && sctx->gds_oa);
228    si_add_gds_to_buffer_list(sctx);
229 }
230 
231 void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
232 {
233    STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
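   /* These are the register values that the CLEAR_STATE packet programs, so
    * recording them lets the register tracker skip redundant writes afterwards.
    */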
234 
235    ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
236    ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
237    ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
238    ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
239    ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
240    ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
241    ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
242    ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
243    ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
244    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
245    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
246    ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
247    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
248    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
249    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
250    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0x00000000;
251    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
252    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
253    ctx->tracked_regs.reg_value[SI_TRACKED_DB_VRS_OVERRIDE_CNTL] = 0x00000000;
254    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
255    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
256    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
257    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
258    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
259    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
260    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
261    ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
262    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE]  = 0x00000000;
263    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1]  = 0x00000000;
264    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2]  = 0x00000000;
265    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3]  = 0x00000000;
266    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE]  = 0x00000000;
267    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT]  = 0x00000000;
268    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE]  = 0x00000000;
269    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1]  = 0x00000000;
270    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2]  = 0x00000000;
271    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3]  = 0x00000000;
272    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT]  = 0x00000000;
273    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL]  = 0x00000000;
274    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP]  = 0x00000000;
275    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE]  = 0x00000000;
276    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN]  = 0x00000000;
277    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF]  = 0x00000000;
278    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG]  = 0x00000000;
279    ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP]  = 0x00000000;
280    ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL]  = 0x00000000;
281    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT]  = 0x00000000;
282    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT]  = 0x00000000;
283    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL]  = 0x00000000;
284    ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL]  = 0x00000000;
285    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA]  = 0x00000000;
286    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR]  = 0x00000000;
287    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL]  = 0x00000000;
288    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL]  = 0x00000002;
289    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT]  = 0x00000000;
290    ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT]  = 0x00000000;
291    ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK]  = 0xffffffff;
292    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM]  = 0x00000000;
293    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL]  = 0x0000001e; /* From GFX8 */
294 
295    /* Set all cleared context registers to saved. */
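   /* Registers tracked at SI_TRACKED_GE_PC_ALLOC and beyond are presumably not
    * written by CLEAR_STATE, so they stay marked as unknown.
    */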
296    ctx->tracked_regs.reg_saved = BITFIELD64_MASK(SI_TRACKED_GE_PC_ALLOC);
297    ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
298 }
299 
300 void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper,
301                              pipe_draw_vertex_state_func vstate_wrapper)
302 {
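   /* Route the draw entry points through a wrapper (used for the TMZ preamble
    * below); passing a NULL wrapper restores the real draw functions.
    */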
303    if (wrapper) {
304       if (wrapper != sctx->b.draw_vbo) {
305          assert(!sctx->real_draw_vbo);
306          assert(!sctx->real_draw_vertex_state);
307          sctx->real_draw_vbo = sctx->b.draw_vbo;
308          sctx->real_draw_vertex_state = sctx->b.draw_vertex_state;
309          sctx->b.draw_vbo = wrapper;
310          sctx->b.draw_vertex_state = vstate_wrapper;
311       }
312    } else if (sctx->real_draw_vbo) {
313       sctx->real_draw_vbo = NULL;
314       sctx->real_draw_vertex_state = NULL;
315       si_select_draw_vbo(sctx);
316    }
317 }
318 
319 static void si_tmz_preamble(struct si_context *sctx)
320 {
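   /* If the encryption status of the bound resources no longer matches the
    * current CS, flush and toggle the secure-submission flag.
    */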
321    bool secure = si_gfx_resources_check_encrypted(sctx);
322    if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
323       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
324                             RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
325    }
326 }
327 
328 static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx,
329                                      const struct pipe_draw_info *info,
330                                      unsigned drawid_offset,
331                                      const struct pipe_draw_indirect_info *indirect,
332                                      const struct pipe_draw_start_count_bias *draws,
333                                      unsigned num_draws) {
334    struct si_context *sctx = (struct si_context *)ctx;
335 
336    si_tmz_preamble(sctx);
337    sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws);
338 }
339 
340 static void si_draw_vstate_tmz_preamble(struct pipe_context *ctx,
341                                         struct pipe_vertex_state *state,
342                                         uint32_t partial_velem_mask,
343                                         struct pipe_draw_vertex_state_info info,
344                                         const struct pipe_draw_start_count_bias *draws,
345                                         unsigned num_draws) {
346    struct si_context *sctx = (struct si_context *)ctx;
347 
348    si_tmz_preamble(sctx);
349    sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws);
350 }
351 
352 void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
353 {
354    bool is_secure = false;
355 
356    if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
357       is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);
358 
359       si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble,
360                               si_draw_vstate_tmz_preamble);
361    }
362 
363    if (ctx->is_debug)
364       si_begin_gfx_cs_debug(ctx);
365 
366    si_add_gds_to_buffer_list(ctx);
367 
368    /* Always invalidate caches at the beginning of IBs, because external
369     * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
370     * buffers.
371     *
372     * Note that the cache flush done by the kernel at the end of GFX IBs
373     * isn't useful here, because that flush can finish after the following
374     * IB starts drawing.
375     *
376     * TODO: Do we also need to invalidate CB & DB caches?
377     */
378    ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
379                  SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
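   /* -1 means the pipeline-stats enable state is unknown at the start of the IB. */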
380    ctx->pipeline_stats_enabled = -1;
381 
382    /* We don't know if the last draw used NGG because it can be a different process.
383     * When switching NGG->legacy, we need to flush VGT for certain hw generations.
384     */
385    if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
386       ctx->flags |= SI_CONTEXT_VGT_FLUSH;
387 
388    if (ctx->border_color_buffer) {
389       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->border_color_buffer,
390                                 RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS);
391    }
392    if (ctx->shadowed_regs) {
393       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->shadowed_regs,
394                                 RADEON_USAGE_READWRITE,
395                                 RADEON_PRIO_DESCRIPTORS);
396    }
397 
398    si_add_all_descriptors_to_bo_list(ctx);
399 
400    if (first_cs || !ctx->shadowed_regs) {
401       si_shader_pointers_mark_dirty(ctx);
402       ctx->cs_shader_state.initialized = false;
403    }
404 
405    if (!ctx->has_graphics) {
406       ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
407       return;
408    }
409 
410    if (ctx->tess_rings) {
411       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs,
412                                 unlikely(is_secure) ? si_resource(ctx->tess_rings_tmz) : si_resource(ctx->tess_rings),
413                                 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
414    }
415 
416    /* Set all valid state groups as dirty so they get re-emitted on the
417     * next draw command.
418     */
419    si_pm4_reset_emitted(ctx, first_cs);
420 
421    /* The CS initialization should be emitted before everything else. */
422    if (ctx->cs_preamble_state)
423       si_pm4_emit(ctx, ctx->cs_preamble_state);
424    if (ctx->cs_preamble_tess_rings)
425       si_pm4_emit(ctx, unlikely(is_secure) ? ctx->cs_preamble_tess_rings_tmz :
426          ctx->cs_preamble_tess_rings);
427    if (ctx->cs_preamble_gs_rings)
428       si_pm4_emit(ctx, ctx->cs_preamble_gs_rings);
429 
430    if (ctx->queued.named.ls)
431       ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
432    if (ctx->queued.named.hs)
433       ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
434    if (ctx->queued.named.es)
435       ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
436    if (ctx->queued.named.gs)
437       ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
438    if (ctx->queued.named.vs)
439       ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
440    if (ctx->queued.named.ps)
441       ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
442 
443    /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
444    bool has_clear_state = ctx->screen->info.has_clear_state;
445    if (has_clear_state || ctx->shadowed_regs) {
446       ctx->framebuffer.dirty_cbufs =
447             u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
448       /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
449       ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
450    } else {
451       ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
452       ctx->framebuffer.dirty_zsbuf = true;
453    }
454 
455    /* Even with shadowed registers, we have to add buffers to the buffer list.
456     * These atoms are the only ones that add buffers.
457     */
458    si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
459    si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
460    if (ctx->screen->use_ngg_culling)
461       si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
462 
463    if (first_cs || !ctx->shadowed_regs) {
464       /* These don't add any buffers, so skip them with shadowing. */
465       si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
466       /* CLEAR_STATE sets zeros. */
467       if (!has_clear_state || ctx->clip_state_any_nonzeros)
468          si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
469       ctx->sample_locs_num_samples = 0;
470       si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
471       si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
472       /* CLEAR_STATE sets 0xffff. */
473       if (!has_clear_state || ctx->sample_mask != 0xffff)
474          si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
475       si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
476       /* CLEAR_STATE sets zeros. */
477       if (!has_clear_state || ctx->blend_color_any_nonzeros)
478          si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
479       si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
480       if (ctx->chip_class >= GFX9)
481          si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
482       si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
483       si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
484       if (!ctx->screen->use_ngg_streamout)
485          si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
486       /* CLEAR_STATE disables all window rectangles. */
487       if (!has_clear_state || ctx->num_window_rectangles > 0)
488          si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
489       si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
490       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
491       si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
492 
493       /* Invalidate various draw states so that they are emitted before
494        * the first draw call. */
495       si_invalidate_draw_constants(ctx);
496       ctx->last_index_size = -1;
497       ctx->last_primitive_restart_en = -1;
498       ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
499       ctx->last_prim = -1;
500       ctx->last_multi_vgt_param = -1;
501       ctx->last_vs_state = ~0;
502       ctx->last_ls = NULL;
503       ctx->last_tcs = NULL;
504       ctx->last_tes_sh_base = -1;
505       ctx->last_num_tcs_input_cp = -1;
506       ctx->last_ls_hs_config = -1; /* impossible value */
507       ctx->last_binning_enabled = -1;
508 
509       if (has_clear_state) {
510          si_set_tracked_regs_to_clear_state(ctx);
511       } else {
512          /* Set all register values to unknown. */
513          ctx->tracked_regs.reg_saved = 0;
514          ctx->last_gs_out_prim = -1; /* unknown */
515       }
516 
517       /* 0xffffffff is an impossible value for the SPI_PS_INPUT_CNTL_n registers. */
518       memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
519    }
520 
521    if (ctx->scratch_buffer) {
522       si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
523       si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
524    }
525 
526    if (ctx->streamout.suspended) {
527       ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
528       si_streamout_buffers_dirty(ctx);
529    }
530 
531    if (!list_is_empty(&ctx->active_queries))
532       si_resume_queries(ctx);
533 
534    assert(!ctx->gfx_cs.prev_dw);
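   /* Remember the size of the freshly initialized IB so si_flush_gfx_cs() can
    * detect and drop no-op flushes.
    */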
535    ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
536 
537    /* All buffer references are removed on a flush, so si_check_needs_implicit_sync
538     * cannot determine if si_make_CB_shader_coherent() needs to be called.
539     * ctx->force_cb_shader_coherent will be cleared by the first call to
540     * si_make_CB_shader_coherent.
541     */
542    ctx->force_cb_shader_coherent = true;
543 }
544 
545 void si_trace_emit(struct si_context *sctx)
546 {
547    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
548    uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
549 
550    si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id);
551 
552    radeon_begin(cs);
553    radeon_emit(PKT3(PKT3_NOP, 0, 0));
554    radeon_emit(AC_ENCODE_TRACE_POINT(trace_id));
555    radeon_end();
556 
557    if (sctx->log)
558       u_log_flush(sctx->log);
559 }
560 
561 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
562 {
563    bool compute_ib = !sctx->has_graphics;
564 
565    assert(sctx->chip_class <= GFX9);
566 
567    /* This seems problematic with GFX7 (see #4764) */
568    if (sctx->chip_class != GFX7)
569       cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
570 
571    radeon_begin(cs);
572 
573    if (sctx->chip_class == GFX9 || compute_ib) {
574       /* Flush caches and wait for the caches to assert idle. */
575       radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
576       radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
577       radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
578       radeon_emit(0xffffff);      /* CP_COHER_SIZE_HI */
579       radeon_emit(0);             /* CP_COHER_BASE */
580       radeon_emit(0);             /* CP_COHER_BASE_HI */
581       radeon_emit(0x0000000A);    /* POLL_INTERVAL */
582    } else {
583       /* ACQUIRE_MEM is only required on a compute ring; the gfx ring can use SURFACE_SYNC. */
584       radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
585       radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
586       radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
587       radeon_emit(0);             /* CP_COHER_BASE */
588       radeon_emit(0x0000000A);    /* POLL_INTERVAL */
589    }
590    radeon_end();
591 
592    /* ACQUIRE_MEM has an implicit context roll if the current context
593     * is busy. */
594    if (!compute_ib)
595       sctx->context_roll = true;
596 }
597 
598 void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
599 {
600    uint32_t gcr_cntl = 0;
601    unsigned cb_db_event = 0;
602    unsigned flags = ctx->flags;
603 
604    if (!ctx->has_graphics) {
605       /* Only process compute flags. */
606       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
607                SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
608                SI_CONTEXT_CS_PARTIAL_FLUSH;
609    }
610 
611    /* We don't need these. */
612    assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));
613 
614    radeon_begin(cs);
615 
616    if (flags & SI_CONTEXT_VGT_FLUSH) {
617       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
618       radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
619    }
620 
621    if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
622       ctx->num_cb_cache_flushes++;
623    if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
624       ctx->num_db_cache_flushes++;
625 
626    if (flags & SI_CONTEXT_INV_ICACHE)
627       gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
628    if (flags & SI_CONTEXT_INV_SCACHE) {
629       /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
630        * to FORWARD when both L1 and L2 are written out (WB or INV).
631        */
632       gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
633    }
634    if (flags & SI_CONTEXT_INV_VCACHE)
635       gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
636 
637    /* The L2 cache ops are:
638     * - INV: - invalidate lines that reflect memory (were loaded from memory)
639     *        - don't touch lines that were overwritten (were stored by gfx clients)
640     * - WB: - don't touch lines that reflect memory
641     *       - write back lines that were overwritten
642     * - WB | INV: - invalidate lines that reflect memory
643     *             - write back lines that were overwritten
644     *
645     * GLM doesn't support WB alone. If WB is set, INV must be set too.
646     */
647    if (flags & SI_CONTEXT_INV_L2) {
648       /* Writeback and invalidate everything in L2. */
649       gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
650       ctx->num_L2_invalidates++;
651    } else if (flags & SI_CONTEXT_WB_L2) {
652       gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
653    } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
654       gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
655    }
656 
657    if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
658       if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
659          /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
660          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
661          radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
662       }
663       if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
664          /* Flush HTILE. Will wait for idle later. */
665          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
666          radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
667       }
668 
669       /* First flush CB/DB, then L1/L2. */
670       gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
671 
672       if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
673           (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
674          cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
675       } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
676          cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
677       } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
678          cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
679       } else {
680          assert(0);
681       }
682    } else {
683       /* Wait for graphics shaders to go idle if requested. */
684       if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
685          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
686          radeon_emit(EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
687          /* Only count explicit shader flushes, not implicit ones. */
688          ctx->num_vs_flushes++;
689          ctx->num_ps_flushes++;
690       } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
691          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
692          radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
693          ctx->num_vs_flushes++;
694       }
695    }
696 
697    if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
698       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
699       radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
700       ctx->num_cs_flushes++;
701       ctx->compute_is_busy = false;
702    }
703    radeon_end();
704 
705    if (cb_db_event) {
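      /* Secure (TMZ) submissions use a separate scratch buffer for the
       * release/wait fence value.
       */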
706       struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ?
707         ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch;
708       /* CB/DB flush and invalidate (or possibly just a wait for a
709        * meta flush) via RELEASE_MEM.
710        *
711        * Combine this with other cache flushes when possible; this
712        * requires the affected shaders to be idle, so do it after the
713        * CS_PARTIAL_FLUSH emitted above (VS/PS partial flushes are always
714        * implied).
715        */
716       uint64_t va;
717 
718       /* Do the flush (enqueue the event and wait for it). */
719       va = wait_mem_scratch->gpu_address;
720       ctx->wait_mem_number++;
721 
722       /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
723       unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
724       unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
725       unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
726       unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
727       assert(G_586_GL2_US(gcr_cntl) == 0);
728       assert(G_586_GL2_RANGE(gcr_cntl) == 0);
729       assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
730       unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
731       unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
732       unsigned gcr_seq = G_586_SEQ(gcr_cntl);
733 
734       gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
735                   C_586_GL2_WB; /* keep SEQ */
736 
737       si_cp_release_mem(ctx, cs, cb_db_event,
738                         S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
739                            S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
740                            S_490_SEQ(gcr_seq),
741                         EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
742                         EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
743                         SI_NOT_QUERY);
744 
745       if (unlikely(ctx->thread_trace_enabled)) {
746          si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
747       }
748 
749       si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
750 
751       if (unlikely(ctx->thread_trace_enabled)) {
752          si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
753       }
754    }
755 
756    radeon_begin_again(cs);
757 
758    /* Ignore fields that only modify the behavior of other fields. */
759    if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
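      /* Bit 31 of CP_COHER_CNTL skips the PFP stall, so it's set unless
       * SI_CONTEXT_PFP_SYNC_ME was requested (the flush itself runs in the ME).
       */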
760       unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;
761 
762       /* Flush caches and wait for the caches to assert idle.
763        * The cache flush is executed in the ME, but the PFP waits
764        * for completion.
765        */
766       radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
767       radeon_emit(dont_sync_pfp); /* CP_COHER_CNTL */
768       radeon_emit(0xffffffff); /* CP_COHER_SIZE */
769       radeon_emit(0xffffff);   /* CP_COHER_SIZE_HI */
770       radeon_emit(0);          /* CP_COHER_BASE */
771       radeon_emit(0);          /* CP_COHER_BASE_HI */
772       radeon_emit(0x0000000A); /* POLL_INTERVAL */
773       radeon_emit(gcr_cntl);   /* GCR_CNTL */
774    } else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
775       /* Synchronize PFP with ME. (this stalls PFP) */
776       radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
777       radeon_emit(0);
778    }
779 
780    if (flags & SI_CONTEXT_START_PIPELINE_STATS && ctx->pipeline_stats_enabled != 1) {
781       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
782       radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
783       ctx->pipeline_stats_enabled = 1;
784    } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && ctx->pipeline_stats_enabled != 0) {
785       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
786       radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
787       ctx->pipeline_stats_enabled = 0;
788    }
789    radeon_end();
790 
791    ctx->flags = 0;
792 }
793 
794 void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
795 {
796    uint32_t flags = sctx->flags;
797 
798    if (!sctx->has_graphics) {
799       /* Only process compute flags. */
800       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
801                SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
802                SI_CONTEXT_CS_PARTIAL_FLUSH;
803    }
804 
805    uint32_t cp_coher_cntl = 0;
806    const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
807 
808    assert(sctx->chip_class <= GFX9);
809 
810    if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
811       sctx->num_cb_cache_flushes++;
812    if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
813       sctx->num_db_cache_flushes++;
814 
815    /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
816     * bit is set. An alternative way is to write SQC_CACHES, but that
817     * doesn't seem to work reliably. Since the bug doesn't affect
818     * correctness (it only does more work than necessary) and
819     * the performance impact is likely negligible, there is no plan
820     * to add a workaround for it.
821     */
822 
823    if (flags & SI_CONTEXT_INV_ICACHE)
824       cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
825    if (flags & SI_CONTEXT_INV_SCACHE)
826       cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
827 
828    if (sctx->chip_class <= GFX8) {
829       if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
830          cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
831                           S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
832                           S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
833                           S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
834                           S_0085F0_CB7_DEST_BASE_ENA(1);
835 
836          /* Necessary for DCC */
837          if (sctx->chip_class == GFX8)
838             si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
839                               EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
840       }
841       if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
842          cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
843    }
844 
845    radeon_begin(cs);
846 
847    if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
848       /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
849       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
850       radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
851    }
852    if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
853       /* Flush HTILE. SURFACE_SYNC will wait for idle. */
854       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
855       radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
856    }
857 
858    /* Wait for shader engines to go idle.
859     * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
860     * for everything including CB/DB cache flushes.
861     */
862    if (!flush_cb_db) {
863       if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
864          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
865          radeon_emit(EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
866          /* Only count explicit shader flushes, not implicit ones
867           * done by SURFACE_SYNC.
868           */
869          sctx->num_vs_flushes++;
870          sctx->num_ps_flushes++;
871       } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
872          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
873          radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
874          sctx->num_vs_flushes++;
875       }
876    }
877 
878    if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
879       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
880       radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
881       sctx->num_cs_flushes++;
882       sctx->compute_is_busy = false;
883    }
884 
885    /* VGT state synchronization. */
886    if (flags & SI_CONTEXT_VGT_FLUSH) {
887       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
888       radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
889    }
890    if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
891       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
892       radeon_emit(EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
893    }
894 
895    radeon_end();
896 
897    /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
898     * wait for idle on GFX9. We have to use a TS event.
899     */
900    if (sctx->chip_class == GFX9 && flush_cb_db) {
901       uint64_t va;
902       unsigned tc_flags, cb_db_event;
903 
904       /* Set the CB/DB flush event. */
905       switch (flush_cb_db) {
906       case SI_CONTEXT_FLUSH_AND_INV_CB:
907          cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
908          break;
909       case SI_CONTEXT_FLUSH_AND_INV_DB:
910          cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
911          break;
912       default:
913          /* both CB & DB */
914          cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
915       }
916 
917       /* These are the only allowed combinations. If you need to
918        * do multiple operations at once, do them separately.
919        * All operations that invalidate L2 also seem to invalidate
920        * metadata. Volatile (VOL) and WC flushes are not listed here.
921        *
922        * TC    | TC_WB         = writeback & invalidate L2 & L1
923        * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
924        *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
925        * TC            | TC_NC = invalidate L2 for MTYPE == NC
926        * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
927        * TCL1                  = invalidate L1
928        */
929       tc_flags = 0;
930 
931       if (flags & SI_CONTEXT_INV_L2_METADATA) {
932          tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
933       }
934 
935       /* Ideally flush TC together with CB/DB. */
936       if (flags & SI_CONTEXT_INV_L2) {
937          /* Writeback and invalidate everything in L2 & L1. */
938          tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;
939 
940          /* Clear the flags. */
941          flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
942          sctx->num_L2_invalidates++;
943       }
944 
945       /* Do the flush (enqueue the event and wait for it). */
946       struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ?
947         sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch;
948       va = wait_mem_scratch->gpu_address;
949       sctx->wait_mem_number++;
950 
951       si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
952                         EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
953                         wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
954 
955       if (unlikely(sctx->thread_trace_enabled)) {
956          si_sqtt_describe_barrier_start(sctx, &sctx->gfx_cs);
957       }
958 
959       si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
960 
961       if (unlikely(sctx->thread_trace_enabled)) {
962          si_sqtt_describe_barrier_end(sctx, &sctx->gfx_cs, sctx->flags);
963       }
964    }
965 
966    /* GFX6-GFX8 only:
967     *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
968     *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
969     *
970     * cp_coher_cntl should contain all necessary flags except TC and PFP flags
971     * at this point.
972     *
973     * GFX6-GFX7 don't support L2 write-back.
974     */
975    if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
976       /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
977        * WB must be set on GFX8+ when TC_ACTION is set.
978        */
979       si_emit_surface_sync(sctx, cs,
980                            cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
981                               S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
982       cp_coher_cntl = 0;
983       sctx->num_L2_invalidates++;
984    } else {
985       /* L1 invalidation and L2 writeback must be done separately,
986        * because both operations can't be done together.
987        */
988       if (flags & SI_CONTEXT_WB_L2) {
989          /* WB = write-back
990           * NC = apply to non-coherent MTYPEs
991           *      (i.e. MTYPE <= 1, which is what we use everywhere)
992           *
993           * WB doesn't work without NC.
994           */
995          si_emit_surface_sync(
996             sctx, cs,
997             cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
998          cp_coher_cntl = 0;
999          sctx->num_L2_writebacks++;
1000       }
1001       if (flags & SI_CONTEXT_INV_VCACHE) {
1002          /* Invalidate per-CU VMEM L1. */
1003          si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
1004          cp_coher_cntl = 0;
1005       }
1006    }
1007 
1008    /* If TC flushes haven't cleared this... */
1009    if (cp_coher_cntl)
1010       si_emit_surface_sync(sctx, cs, cp_coher_cntl);
1011 
1012    if (flags & SI_CONTEXT_PFP_SYNC_ME) {
1013       radeon_begin(cs);
1014       radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
1015       radeon_emit(0);
1016       radeon_end();
1017    }
1018 
1019    if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
1020       radeon_begin(cs);
1021       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
1022       radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
1023       radeon_end();
1024       sctx->pipeline_stats_enabled = 1;
1025    } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && sctx->pipeline_stats_enabled != 0) {
1026       radeon_begin(cs);
1027       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
1028       radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
1029       radeon_end();
1030       sctx->pipeline_stats_enabled = 0;
1031    }
1032 
1033    sctx->flags = 0;
1034 }
1035