/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_format.h"
#include "vk_render_pass.h"
#include "vk_util.h"
#include "util/fast_idiv_by_const.h"

#include "common/intel_aux_map.h"
#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"
#include "common/intel_guardband.h"

#include "nir/nir_xfb_info.h"

#include "ds/intel_tracepoints.h"

/* We reserve:
 *    - GPR 14 for secondary command buffer returns
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
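/* The hooks below route mi_builder's batch emission through anv's own batch
 * helpers so that MI commands land directly in this command buffer's batch.
 */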
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"

static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t pipeline);

static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
   enum anv_pipe_bits bits = 0;
   bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
   bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
#if GFX_VERx10 >= 125
   bits |= (pc->PSSStallSyncEnable) ?  ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
#endif
#if GFX_VER >= 12
   bits |= (pc->TileCacheFlushEnable) ?  ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
   bits |= (pc->HDCPipelineFlushEnable) ?  ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
#endif
   bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
   bits |= (pc->VFCacheInvalidationEnable) ?  ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
   bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
   bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
   return bits;
}

#define anv_debug_dump_pc(pc) \
   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
      fputs("pc: emit PC=( ", stderr); \
      anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
      fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
   }

static bool
is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_queue_family *queue_family = cmd_buffer->queue_family;
   return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
}

void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);

   /* If we are emitting a new state base address we probably need to re-emit
    * binding tables.
    */
   cmd_buffer->state.descriptors_dirty |= ~0;

#if GFX_VERx10 >= 125
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }
   anv_batch_emit(
      &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
      btpa.BindingTablePoolBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      btpa.BindingTablePoolBufferSize = BINDING_TABLE_POOL_BLOCK_SIZE / 4096;
      btpa.MOCS = mocs;
   }
#else /* GFX_VERx10 < 125 */
   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 12
      pc.HDCPipelineFlushEnable = true;
#else
      pc.DCFlushEnable = true;
#endif
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }

#if GFX_VERx10 == 120
   /* Wa_1607854226:
    *
    *  Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
    *  mode by putting the pipeline temporarily in 3D mode.
    */
   uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
   genX(flush_pipeline_select_3d)(cmd_buffer);
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;

#  if (GFX_VER >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields.  However, since we will be growing the BOs live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize       = 0xfffff;
      sba.IndirectObjectBufferSize     = 0xfffff;
      if (anv_use_relocations(device->physical)) {
         sba.DynamicStateBufferSize    = 0xfffff;
         sba.InstructionBufferSize     = 0xfffff;
      } else {
         /* With softpin, we use fixed addresses so we actually know how big
          * our base addresses are.
          */
         sba.DynamicStateBufferSize    = DYNAMIC_STATE_POOL_SIZE / 4096;
         sba.InstructionBufferSize     = INSTRUCTION_STATE_POOL_SIZE / 4096;
      }
      sba.GeneralStateBufferSizeModifyEnable    = true;
      sba.IndirectObjectBufferSizeModifyEnable  = true;
      sba.DynamicStateBufferSizeModifyEnable    = true;
      sba.InstructionBuffersizeModifyEnable     = true;
#  else
      /* On gfx7, we have upper bounds instead.  According to the docs,
       * setting an upper bound of zero means that no bounds checking is
       * performed so, in theory, we should be able to leave them zero.
       * However, border color is broken and the GPU bounds-checks anyway.
       * To avoid this and other potential problems, we may as well set it
       * for everything.
       */
      sba.GeneralStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.InstructionAccessUpperBoundModifyEnable = true;
#  endif
#  if (GFX_VER >= 9)
      sba.BindlessSurfaceStateBaseAddress =
         (struct anv_address) { device->surface_state_pool.block_pool.bo, 0 };
      sba.BindlessSurfaceStateSize = (1 << 20) - 1;
      sba.BindlessSurfaceStateMOCS = mocs;
      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
#  endif
#  if (GFX_VER >= 10)
      sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
      sba.BindlessSamplerStateBufferSize = 0;
#  endif
   }

#if GFX_VERx10 == 120
   /* Wa_1607854226:
    *
    *  Put the pipeline back into its current mode.
    */
   if (gfx12_wa_pipeline != UINT32_MAX)
      genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
#endif

#endif /* GFX_VERx10 < 125 */

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    *
    * Wa_14013910100:
    *
    *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
    *   or program pipe control with Instruction cache invalidate post
    *   STATE_BASE_ADDRESS command"
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
#if GFX_VERx10 == 125
      pc.InstructionCacheInvalidateEnable = true;
#endif
      anv_debug_dump_pc(pc);
   }
}

static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_state state, struct anv_address addr)
{
   VkResult result;

   if (anv_use_relocations(cmd_buffer->device->physical)) {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
                                  &cmd_buffer->vk.pool->alloc,
                                  state.offset + isl_dev->ss.addr_offset,
                                  addr.bo, addr.offset, NULL);
   } else {
      result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
                                     &cmd_buffer->vk.pool->alloc,
                                     addr.bo);
   }

   if (unlikely(result != VK_SUCCESS))
      anv_batch_set_error(&cmd_buffer->batch, result);
}

static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_surface_state state)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   assert(!anv_address_is_null(state.address));
   add_surface_reloc(cmd_buffer, state.state, state.address);

   if (!anv_address_is_null(state.aux_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->vk.pool->alloc,
                            state.state.offset + isl_dev->ss.aux_addr_offset,
                            state.aux_address.bo,
                            state.aux_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }

   if (!anv_address_is_null(state.clear_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->vk.pool->alloc,
                            state.state.offset +
                            isl_dev->ss.clear_color_state_offset,
                            state.clear_address.bo,
                            state.clear_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }
}

static bool
isl_color_value_requires_conversion(union isl_color_value color,
                                    const struct isl_surf *surf,
                                    const struct isl_view *view)
{
   if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
      return false;

   uint32_t surf_pack[4] = { 0, 0, 0, 0 };
   isl_color_value_pack(&color, surf->format, surf_pack);

   uint32_t view_pack[4] = { 0, 0, 0, 0 };
   union isl_color_value swiz_color =
      isl_color_value_swizzle_inv(color, view->swizzle);
   isl_color_value_pack(&swiz_color, view->format, view_pack);

   return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
}

static bool
anv_can_fast_clear_color_view(struct anv_device *device,
                              struct anv_image_view *iview,
                              VkImageLayout layout,
                              union isl_color_value clear_color,
                              uint32_t num_layers,
                              VkRect2D render_area)
{
   if (iview->planes[0].isl.base_array_layer >=
       anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
                            iview->planes[0].isl.base_level))
      return false;

   /* Start by getting the fast clear type.  We use the first subpass
    * layout here because we don't want to fast-clear if the first subpass
    * to use the attachment can't handle fast-clears.
    */
   enum anv_fast_clear_type fast_clear_type =
      anv_layout_to_fast_clear_type(&device->info, iview->image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    layout);
   switch (fast_clear_type) {
   case ANV_FAST_CLEAR_NONE:
      return false;
   case ANV_FAST_CLEAR_DEFAULT_VALUE:
      if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
         return false;
      break;
   case ANV_FAST_CLEAR_ANY:
      break;
   }

   /* Potentially, we could do partial fast-clears but doing so has crazy
    * alignment restrictions.  It's easier to just restrict to full size
    * fast clears for now.
    */
   if (render_area.offset.x != 0 ||
       render_area.offset.y != 0 ||
       render_area.extent.width != iview->vk.extent.width ||
       render_area.extent.height != iview->vk.extent.height)
      return false;

   /* On Broadwell and earlier, we can only handle 0/1 clear colors */
   if (GFX_VER <= 8 &&
       !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
      return false;

   /* If the clear color is one that would require non-trivial format
    * conversion on resolve, we don't bother with the fast clear.  This
    * shouldn't be common as most clear colors are 0/1 and the most common
    * format re-interpretation is for sRGB.
    */
   if (isl_color_value_requires_conversion(clear_color,
                                           &iview->image->planes[0].primary_surface.isl,
                                           &iview->planes[0].isl)) {
      anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
                    "Cannot fast-clear to colors which would require "
                    "format conversion on resolve");
      return false;
   }

   /* We only allow fast clears to the first slice of an image (level 0,
    * layer 0) and only for the entire slice.  This guarantees that, at
    * any given time, there is only one clear color on any given image.
    * At the time of our testing (Jan 17, 2018), there were no known
    * applications which would benefit from fast-clearing more than just
    * the first slice.
    */
   if (iview->planes[0].isl.base_level > 0 ||
       iview->planes[0].isl.base_array_layer > 0) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering with multi-lod or multi-layer framebuffer "
                    "with LOAD_OP_LOAD and baseMipLevel > 0 or "
                    "baseArrayLayer > 0.  Not fast clearing.");
      return false;
   }

   if (num_layers > 1) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering to a multi-layer framebuffer with "
                    "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
   }

   return true;
}

static bool
anv_can_hiz_clear_ds_view(struct anv_device *device,
                          const struct anv_image_view *iview,
                          VkImageLayout layout,
                          VkImageAspectFlags clear_aspects,
                          float depth_clear_value,
                          VkRect2D render_area)
{
   /* We don't do any HiZ or depth fast-clears on gfx7 yet */
   if (GFX_VER == 7)
      return false;

   /* If we're just clearing stencil, we can always HiZ clear */
   if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return true;

   /* We must have depth in order to have HiZ */
   if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return false;

   const enum isl_aux_usage clear_aux_usage =
      anv_layout_to_aux_usage(&device->info, iview->image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                              layout);
   if (!blorp_can_hiz_clear_depth(&device->info,
                                  &iview->image->planes[0].primary_surface.isl,
                                  clear_aux_usage,
                                  iview->planes[0].isl.base_level,
                                  iview->planes[0].isl.base_array_layer,
                                  render_area.offset.x,
                                  render_area.offset.y,
                                  render_area.offset.x +
                                  render_area.extent.width,
                                  render_area.offset.y +
                                  render_area.extent.height))
      return false;

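   /* We only fast clear depth to the fixed HiZ fast-clear value
    * (ANV_HZ_FC_VAL); any other clear value takes the slow path.
    */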
   if (depth_clear_value != ANV_HZ_FC_VAL)
      return false;

   /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
    * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
    * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
    */
   if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
      return false;

   /* If we got here, then we can fast clear */
   return true;
}

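/* Force the compiler to perform exactly one load of x (via a volatile
 * access), used below when reading AUX-TT entries that may be live.
 */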
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#if GFX_VER == 12
static void
anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect,
                      uint32_t base_level, uint32_t level_count,
                      uint32_t base_layer, uint32_t layer_count)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   const struct anv_surface *surface = &image->planes[plane].primary_surface;
   uint64_t base_address =
      anv_address_physical(anv_image_address(image, &surface->memory_range));

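   /* Look up the primary surface layout and the AUX-TT format bits that get
    * OR'ed into each 64K aux-map entry updated below.
    */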
   const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
   uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);

   /* We're about to live-update the AUX-TT.  We really don't want anyone else
    * trying to read it while we're doing this.  We could probably get away
    * with not having this stall in some cases if we were really careful but
    * it's better to play it safe.  Full stall the GPU.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "before update AUX-TT");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   for (uint32_t a = 0; a < layer_count; a++) {
      const uint32_t layer = base_layer + a;

      uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;

         uint32_t logical_array_layer, logical_z_offset_px;
         if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
            logical_array_layer = 0;

            /* If the given miplevel does not have this layer, then any higher
             * miplevels won't either because miplevels only get smaller the
             * higher the LOD.
             */
            assert(layer < image->vk.extent.depth);
            if (layer >= anv_minify(image->vk.extent.depth, level))
               break;
            logical_z_offset_px = layer;
         } else {
            assert(layer < image->vk.array_layers);
            logical_array_layer = layer;
            logical_z_offset_px = 0;
         }

         uint64_t slice_start_offset_B, slice_end_offset_B;
         isl_surf_get_image_range_B_tile(isl_surf, level,
                                         logical_array_layer,
                                         logical_z_offset_px,
                                         &slice_start_offset_B,
                                         &slice_end_offset_B);

         start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
         end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
      }

      /* Aux operates 64K at a time */
      start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
      end_offset_B = align_u64(end_offset_B, 64 * 1024);

      for (uint64_t offset = start_offset_B;
           offset < end_offset_B; offset += 64 * 1024) {
         uint64_t address = base_address + offset;

         uint64_t aux_entry_addr64, *aux_entry_map;
         aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
                                                 address, &aux_entry_addr64);

         assert(!anv_use_relocations(cmd_buffer->device->physical));
         struct anv_address aux_entry_address = {
            .bo = NULL,
            .offset = aux_entry_addr64,
         };

         const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
         uint64_t new_aux_entry =
            (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;

         if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
            new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;

         mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                             "after update AUX-TT");
}
#endif /* GFX_VER == 12 */

/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
 * the initial layout is undefined, the HiZ buffer and depth buffer will
 * represent the same data at the end of this operation.
 */
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        bool will_full_fast_clear)
{
   const uint32_t depth_plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
   if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

#if GFX_VER == 12
   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                            0, 1, base_layer, layer_count);
   }
#endif

   /* If will_full_fast_clear is set, the caller promises to fast-clear the
    * largest portion of the specified range as it can.  For depth images,
    * that means the entire image because we don't support multi-LOD HiZ.
    */
   assert(image->planes[0].primary_surface.isl.levels == 1);
   if (will_full_fast_clear)
      return;

   const enum isl_aux_state initial_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              initial_layout);
   const enum isl_aux_state final_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              final_layout);

   const bool initial_depth_valid =
      isl_aux_state_has_valid_primary(initial_state);
   const bool initial_hiz_valid =
      isl_aux_state_has_valid_aux(initial_state);
   const bool final_needs_depth =
      isl_aux_state_has_valid_primary(final_state);
   const bool final_needs_hiz =
      isl_aux_state_has_valid_aux(final_state);

   /* Getting into the pass-through state for Depth is tricky and involves
    * both a resolve and an ambiguate.  We don't handle that state right now
    * as anv_layout_to_aux_state never returns it.
    */
   assert(final_state != ISL_AUX_STATE_PASS_THROUGH);

   if (final_needs_depth && !initial_depth_valid) {
      assert(initial_hiz_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
   } else if (final_needs_hiz && !initial_hiz_valid) {
      assert(initial_depth_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
   }
}

#if GFX_VER == 7
static inline bool
vk_image_layout_stencil_write_optimal(VkImageLayout layout)
{
   return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
}
#endif

/* Transitions a stencil buffer from one layout to another.  On gfx7, this
 * keeps the texturable shadow copy in sync; on gfx12, it initializes the
 * stencil compression state when coming from an undefined layout.
 */
static void
transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_image *image,
                          uint32_t base_level, uint32_t level_count,
                          uint32_t base_layer, uint32_t layer_count,
                          VkImageLayout initial_layout,
                          VkImageLayout final_layout,
                          bool will_full_fast_clear)
{
#if GFX_VER == 7
   const uint32_t plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);

   /* On gfx7, we have to store a texturable version of the stencil buffer in
    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
    * forth at strategic points. Stencil writes are only allowed in the
    * following layouts:
    *
    *  - VK_IMAGE_LAYOUT_GENERAL
    *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
    *
    * For general, we have no nice opportunity to transition so we do the copy
    * to the shadow unconditionally at the end of the subpass. For transfer
    * destinations, we can update it as part of the transfer op. For the other
    * layouts, we delay the copy until a transition into some other layout.
    */
   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       vk_image_layout_stencil_write_optimal(initial_layout) &&
       !vk_image_layout_stencil_write_optimal(final_layout)) {
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_STENCIL_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }
#elif GFX_VER == 12
   const uint32_t plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
   if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
                            base_level, level_count, base_layer, layer_count);

      /* If will_full_fast_clear is set, the caller promises to fast-clear the
       * largest portion of the specified range as it can.
       */
      if (will_full_fast_clear)
         return;

      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;
         const VkRect2D clear_rect = {
            .offset.x = 0,
            .offset.y = 0,
            .extent.width = anv_minify(image->vk.extent.width, level),
            .extent.height = anv_minify(image->vk.extent.height, level),
         };

         uint32_t aux_layers =
            anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
         uint32_t level_layer_count =
            MIN2(layer_count, aux_layers - base_layer);

         /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
          * Enable:
          *
          *    "When enabled, Stencil Buffer needs to be initialized via
          *    stencil clear (HZ_OP) before any renderpass."
          */
         anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
                             level, base_layer, level_layer_count,
                             clear_rect, 0 /* Stencil clear value */);
      }
   }
#endif
}

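/* MMIO offsets of the command streamer's MI_PREDICATE source and result
 * registers, used with the MI_PREDICATE command to predicate the resolve
 * operations below.
 */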
#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

static void
set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
                         VkImageAspectFlagBits aspect,
                         uint32_t level,
                         uint32_t base_layer, uint32_t layer_count,
                         bool compressed)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   /* We only have compression tracking for CCS_E */
   if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
      return;

   for (uint32_t a = 0; a < layer_count; a++) {
      uint32_t layer = base_layer + a;
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
                                                            image, aspect,
                                                            level, layer);
         sdi.ImmediateData = compressed ? UINT32_MAX : 0;
      }
   }
}

static void
set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
                           const struct anv_image *image,
                           VkImageAspectFlagBits aspect,
                           enum anv_fast_clear_type fast_clear)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                       image, aspect);
      sdi.ImmediateData = fast_clear;
   }

   /* Whenever we have fast-clear, we consider that slice to be compressed.
    * This makes building predicates much easier.
    */
   if (fast_clear != ANV_FAST_CLEAR_NONE)
      set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
}

/* This is only really practical on haswell and above because it requires
 * MI math in order to get it correct.
 */
#if GFX_VERx10 >= 75
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                  const struct anv_image *image,
                                  VkImageAspectFlagBits aspect,
                                  uint32_t level, uint32_t array_layer,
                                  enum isl_aux_op resolve_op,
                                  enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   const struct mi_value fast_clear_type =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
      /* In this case, we're doing a full resolve which means we want the
       * resolve to happen if any compression (including fast-clears) is
       * present.
       *
       * In order to simplify the logic a bit, we make the assumption that,
       * if the first slice has been fast-cleared, it is also marked as
       * compressed.  See also set_image_fast_clear_state.
       */
      const struct mi_value compression_state =
         mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
                                                       image, aspect,
                                                       level, array_layer));
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
      mi_store(&b, compression_state, mi_imm(0));

      if (level == 0 && array_layer == 0) {
         /* If the predicate is true, we want to write 0 to the fast clear type
          * and, if it's false, leave it alone.  We can do this by writing
          *
          * clear_type = clear_type & ~predicate;
          */
         struct mi_value new_fast_clear_type =
            mi_iand(&b, fast_clear_type,
                        mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
         mi_store(&b, fast_clear_type, new_fast_clear_type);
      }
   } else if (level == 0 && array_layer == 0) {
      /* In this case, we are doing a partial resolve to get rid of fast-clear
       * colors.  We don't care about the compression state but we do care
       * about how much fast clear is allowed by the final layout.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);

      /* We need to compute (fast_clear_supported < image->fast_clear) */
      struct mi_value pred =
         mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));

      /* If the predicate is true, we want to write 0 to the fast clear type
       * and, if it's false, leave it alone.  We can do this by writing
       *
       * clear_type = clear_type & ~predicate;
       */
      struct mi_value new_fast_clear_type =
         mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
      mi_store(&b, fast_clear_type, new_fast_clear_type);
   } else {
      /* In this case, we're trying to do a partial resolve on a slice that
       * doesn't have clear color.  There's nothing to do.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      return;
   }

   /* Set src1 to 0 and use a != condition */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VERx10 >= 75 */

#if GFX_VER <= 8
static void
anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                 const struct anv_image *image,
                                 VkImageAspectFlagBits aspect,
                                 uint32_t level, uint32_t array_layer,
                                 enum isl_aux_op resolve_op,
                                 enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value fast_clear_type_mem =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                      image, aspect));

   /* This only works for partial resolves and only when the clear color is
    * all or nothing.  On the upside, this emits less command streamer code
    * and works on Ivybridge and Bay Trail.
    */
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
   assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);

   /* We don't support fast clears on anything other than the first slice. */
   if (level > 0 || array_layer > 0)
      return;

   /* On gfx8, we don't have a concept of default clear colors because we
    * can't sample from CCS surfaces.  It's enough to just load the fast clear
    * state into the predicate register.
    */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
   mi_store(&b, fast_clear_type_mem, mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VER <= 8 */

static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t level, uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

#if GFX_VER >= 9
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, level, array_layer,
                                     resolve_op, fast_clear_supported);
#else /* GFX_VER <= 8 */
   anv_cmd_simple_resolve_predicate(cmd_buffer, image,
                                    aspect, level, array_layer,
                                    resolve_op, fast_clear_supported);
#endif

   /* CCS_D only supports full resolves and BLORP will assert on us if we try
    * to do a partial resolve on a CCS_D surface.
    */
   if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
       image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
                    level, array_layer, 1, resolve_op, NULL, true);
}

static void
anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);

#if GFX_VERx10 >= 75
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, 0, array_layer,
                                     resolve_op, fast_clear_supported);

   anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
                    array_layer, 1, resolve_op, NULL, true);
#else
   unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
#endif
}

void
genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
                                    const struct anv_image *image,
                                    VkImageAspectFlagBits aspect,
                                    enum isl_aux_usage aux_usage,
                                    uint32_t level,
                                    uint32_t base_layer,
                                    uint32_t layer_count)
{
   /* The aspect must be exactly one of the image aspects. */
   assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));

   /* The only compression types with more than just fast-clears are MCS,
    * CCS_E, and HiZ.  With HiZ we just trust the layout and don't actually
    * track the current fast-clear and compression state.  This leaves us
    * with just MCS and CCS_E.
    */
   if (aux_usage != ISL_AUX_USAGE_CCS_E &&
       aux_usage != ISL_AUX_USAGE_MCS)
      return;

   set_image_compressed_bit(cmd_buffer, image, aspect,
                            level, base_layer, layer_count, true);
}

static void
init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect)
{
   assert(cmd_buffer && image);
   assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   set_image_fast_clear_state(cmd_buffer, image, aspect,
                              ANV_FAST_CLEAR_NONE);

   /* Initialize the struct fields that are accessed for fast-clears so that
    * the HW restrictions on the field values are satisfied.
    */
   struct anv_address addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);

   if (GFX_VER >= 9) {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      const unsigned num_dwords = GFX_VER >= 10 ?
                                  isl_dev->ss.clear_color_state_size / 4 :
                                  isl_dev->ss.clear_value_size / 4;
      for (unsigned i = 0; i < num_dwords; i++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = addr;
            sdi.Address.offset += i * 4;
            sdi.ImmediateData = 0;
         }
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = addr;
         if (GFX_VERx10 >= 75) {
            /* Pre-SKL, the dword containing the clear values also contains
             * other fields, so we need to initialize those fields to match the
             * values that would be in a color attachment.
             */
            sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
                                ISL_CHANNEL_SELECT_GREEN << 22 |
                                ISL_CHANNEL_SELECT_BLUE  << 19 |
                                ISL_CHANNEL_SELECT_ALPHA << 16;
         } else if (GFX_VER == 7) {
            /* On IVB, the dword containing the clear values also contains
             * other fields that must be zero or can be zero.
             */
            sdi.ImmediateData = 0;
         }
      }
   }
}

/* Copy the fast-clear value dword(s) between a surface state object and an
 * image's fast clear state buffer.
 */
static void
genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_state surface_state,
                             const struct anv_image *image,
                             VkImageAspectFlagBits aspect,
                             bool copy_from_surface_state)
{
   assert(cmd_buffer && image);
   assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   struct anv_address ss_clear_addr = {
      .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
      .offset = surface_state.offset +
                cmd_buffer->device->isl_dev.ss.clear_value_offset,
   };
   const struct anv_address entry_addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
   unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;

#if GFX_VER == 7
   /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
    * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
    * in-flight when they are issued even if the memory touched is not
    * currently active for rendering.  The weird bit is that it is not the
    * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
    * rendering hangs such that the next stalling command after the
    * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
    *
    * It is unclear exactly why this hang occurs.  Both MI commands come with
    * warnings about the 3D pipeline but that doesn't seem to fully explain
    * it.  My (Jason's) best theory is that it has something to do with the
    * fact that we're using a GPU state register as our temporary and that
    * something with reading/writing it is causing problems.
    *
    * In order to work around this issue, we emit a PIPE_CONTROL with the
    * command streamer stall bit set.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "after copy_fast_clear_dwords. Avoid potential hang");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (copy_from_surface_state) {
      mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
   } else {
      mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);

      /* Updating a surface state object may require that the state cache be
       * invalidated. From the SKL PRM, Shared Functions -> State -> State
       * Caching:
       *
       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
       *    modified [...], the L1 state cache must be invalidated to ensure
       *    the new surface or sampler state is fetched from system memory.
       *
       * In testing, SKL doesn't actually seem to need this, but HSW does.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
                                "after copy_fast_clear_dwords surface state update");
   }
}
1159 
1160 /**
1161  * @brief Transitions a color buffer from one layout to another.
1162  *
1163  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
1164  * more information.
1165  *
1166  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
1167  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
1168  *                    this represents the maximum layers to transition at each
1169  *                    specified miplevel.
1170  */
1171 static void
transition_color_buffer(struct anv_cmd_buffer * cmd_buffer,const struct anv_image * image,VkImageAspectFlagBits aspect,const uint32_t base_level,uint32_t level_count,uint32_t base_layer,uint32_t layer_count,VkImageLayout initial_layout,VkImageLayout final_layout,uint64_t src_queue_family,uint64_t dst_queue_family,bool will_full_fast_clear)1172 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
1173                         const struct anv_image *image,
1174                         VkImageAspectFlagBits aspect,
1175                         const uint32_t base_level, uint32_t level_count,
1176                         uint32_t base_layer, uint32_t layer_count,
1177                         VkImageLayout initial_layout,
1178                         VkImageLayout final_layout,
1179                         uint64_t src_queue_family,
1180                         uint64_t dst_queue_family,
1181                         bool will_full_fast_clear)
1182 {
1183    struct anv_device *device = cmd_buffer->device;
1184    const struct intel_device_info *devinfo = &device->info;
1185    /* Validate the inputs. */
1186    assert(cmd_buffer);
1187    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1188    /* These values aren't supported for simplicity's sake. */
1189    assert(level_count != VK_REMAINING_MIP_LEVELS &&
1190           layer_count != VK_REMAINING_ARRAY_LAYERS);
1191    /* Ensure the subresource range is valid. */
1192    UNUSED uint64_t last_level_num = base_level + level_count;
1193    const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
1194    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1195    assert((uint64_t)base_layer + layer_count <= image_layers);
1196    assert(last_level_num <= image->vk.mip_levels);
1197    /* If there is a layout transition, the final layout cannot be undefined or
1198     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1199     */
1200    assert(initial_layout == final_layout ||
1201           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1202            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1203    const struct isl_drm_modifier_info *isl_mod_info =
1204       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1205       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1206       : NULL;
1207 
1208    const bool src_queue_external =
1209       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1210       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1211 
1212    const bool dst_queue_external =
1213       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1214       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1215 
1216    /* Simultaneous acquire and release on external queues is illegal. */
1217    assert(!src_queue_external || !dst_queue_external);
1218 
1219    /* Ownership transition on an external queue requires special action if the
1220     * image has a DRM format modifier because we store image data in
1221     * a driver-private bo which is inaccessible to the external queue.
1222     */
1223    const bool private_binding_acquire =
1224       src_queue_external &&
1225       anv_image_is_externally_shared(image) &&
1226       anv_image_has_private_binding(image);
1227 
1228    const bool private_binding_release =
1229       dst_queue_external &&
1230       anv_image_is_externally_shared(image) &&
1231       anv_image_has_private_binding(image);
1232 
1233    if (initial_layout == final_layout &&
1234        !private_binding_acquire && !private_binding_release) {
1235       /* No work is needed. */
1236       return;
1237    }
1238 
1239    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1240 
1241    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
1242        final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
1243       /* This surface is a linear compressed image with a tiled shadow surface
1244        * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
1245        * we need to ensure the shadow copy is up-to-date.
1246        */
1247       assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
1248       assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
1249       assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
1250       assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
1251       assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
1252       assert(plane == 0);
1253       anv_image_copy_to_shadow(cmd_buffer, image,
1254                                VK_IMAGE_ASPECT_COLOR_BIT,
1255                                base_level, level_count,
1256                                base_layer, layer_count);
1257    }
1258 
1259    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1260       return;
1261 
1262    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1263 
1264    /* The following layouts are equivalent for non-linear images. */
1265    const bool initial_layout_undefined =
1266       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1267       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1268 
1269    bool must_init_fast_clear_state = false;
1270    bool must_init_aux_surface = false;
1271 
1272    if (initial_layout_undefined) {
1273       /* The subresource may have been aliased and populated with arbitrary
1274        * data.
1275        */
1276       must_init_fast_clear_state = true;
1277       must_init_aux_surface = true;
1278    } else if (private_binding_acquire) {
1279       /* The fast clear state lives in a driver-private bo, and therefore the
1280        * external/foreign queue is unaware of it.
1281        *
1282        * If this is the first time we are accessing the image, then the fast
1283        * clear state is uninitialized.
1284        *
1285        * If this is NOT the first time we are accessing the image, then the fast
1286        * clear state may still be valid and correct due to the resolve during
1287        * our most recent ownership release.  However, we do not track the aux
1288        * state with MI stores, and therefore must assume the worst-case: that
1289        * this is the first time we are accessing the image.
1290        */
1291       assert(image->planes[plane].fast_clear_memory_range.binding ==
1292               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1293       must_init_fast_clear_state = true;
1294 
1295       if (image->planes[plane].aux_surface.memory_range.binding ==
1296           ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1297          assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
1298 
1299          /* The aux surface, like the fast clear state, lives in
1300           * a driver-private bo.  We must initialize the aux surface for the
1301           * same reasons we must initialize the fast clear state.
1302           */
1303          must_init_aux_surface = true;
1304       } else {
1305          assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
1306 
1307          /* The aux surface, unlike the fast clear state, lives in
1308           * application-visible VkDeviceMemory and is shared with the
1309           * external/foreign queue. Therefore, when we acquire ownership of the
1310           * image with a defined VkImageLayout, the aux surface is valid and has
1311           * the aux state required by the modifier.
1312           */
1313          must_init_aux_surface = false;
1314       }
1315    }
1316 
1317 #if GFX_VER == 12
1318    if (initial_layout_undefined) {
1319       if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
1320          anv_image_init_aux_tt(cmd_buffer, image, aspect,
1321                                base_level, level_count,
1322                                base_layer, layer_count);
1323       }
1324    }
1325 #else
1326    assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
1327 #endif
1328 
1329    if (must_init_fast_clear_state) {
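      /* The fast-clear state (clear color and fast-clear type) is only ever
       * consumed for the first LOD and array slice of color images (see the
       * will_full_fast_clear comments below), so it only needs initializing
       * when the transition covers level 0 / layer 0.
       */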
1330       if (base_level == 0 && base_layer == 0)
1331          init_fast_clear_color(cmd_buffer, image, aspect);
1332    }
1333 
1334    if (must_init_aux_surface) {
1335       assert(must_init_fast_clear_state);
1336 
1337       /* Initialize the aux buffers to enable correct rendering.  In order to
1338        * ensure that things such as storage images work correctly, aux buffers
1339        * need to be initialized to valid data.
1340        *
1341        * Having an aux buffer with invalid data is a problem for two reasons:
1342        *
1343        *  1) Having an invalid value in the buffer can confuse the hardware.
1344        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1345        *     invalid and leads to the hardware doing strange things.  It
1346        *     doesn't hang as far as we can tell but rendering corruption can
1347        *     occur.
1348        *
1349        *  2) If this transition is into the GENERAL layout and we then use the
1350        *     image as a storage image, then we must have the aux buffer in the
1351        *     pass-through state so that, if we then go to texture from the
1352        *     image, we get the results of our storage image writes and not the
1353        *     fast clear color or other random data.
1354        *
1355        * For CCS both of the problems above are real demonstrable issues.  In
1356        * that case, the only thing we can do is to perform an ambiguate to
1357        * transition the aux surface into the pass-through state.
1358        *
1359        * For MCS, (2) is never an issue because we don't support multisampled
1360        * storage images.  In theory, issue (1) is a problem with MCS but we've
1361        * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
1362        * theory, be interpreted as something but we don't know that all bit
1363        * patterns are actually valid.  For 2x and 8x, you could easily end up
1364        * with the MCS referring to an invalid plane because not all bits of
1365        * the MCS value are actually used.  Even though we've never seen issues
1366        * in the wild, it's best to play it safe and initialize the MCS.  We
1367        * can use a fast-clear for MCS because we only ever touch from render
1368        * and texture (no image load store).
1369        */
1370       if (image->vk.samples == 1) {
1371          for (uint32_t l = 0; l < level_count; l++) {
1372             const uint32_t level = base_level + l;
1373 
1374             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1375             if (base_layer >= aux_layers)
1376                break; /* We will only get fewer layers as level increases */
1377             uint32_t level_layer_count =
1378                MIN2(layer_count, aux_layers - base_layer);
1379 
1380             /* If will_full_fast_clear is set, the caller promises to
1381              * fast-clear the largest portion of the specified range that it can.
1382              * For color images, that means only the first LOD and array slice.
1383              */
1384             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1385                base_layer++;
1386                level_layer_count--;
1387                if (level_layer_count == 0)
1388                   continue;
1389             }
1390 
1391             anv_image_ccs_op(cmd_buffer, image,
1392                              image->planes[plane].primary_surface.isl.format,
1393                              ISL_SWIZZLE_IDENTITY,
1394                              aspect, level, base_layer, level_layer_count,
1395                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1396 
1397             if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
1398                set_image_compressed_bit(cmd_buffer, image, aspect,
1399                                         level, base_layer, level_layer_count,
1400                                         false);
1401             }
1402          }
1403       } else {
1404          if (image->vk.samples == 4 || image->vk.samples == 16) {
1405             anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1406                           "Doing a potentially unnecessary fast-clear to "
1407                           "define an MCS buffer.");
1408          }
1409 
1410          /* If will_full_fast_clear is set, the caller promises to fast-clear
1411           * the largest portion of the specified range that it can.
1412           */
1413          if (will_full_fast_clear)
1414             return;
1415 
1416          assert(base_level == 0 && level_count == 1);
1417          anv_image_mcs_op(cmd_buffer, image,
1418                           image->planes[plane].primary_surface.isl.format,
1419                           ISL_SWIZZLE_IDENTITY,
1420                           aspect, base_layer, layer_count,
1421                           ISL_AUX_OP_FAST_CLEAR, NULL, false);
1422       }
1423       return;
1424    }
1425 
1426    enum isl_aux_usage initial_aux_usage =
1427       anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1428    enum isl_aux_usage final_aux_usage =
1429       anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1430    enum anv_fast_clear_type initial_fast_clear =
1431       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1432    enum anv_fast_clear_type final_fast_clear =
1433       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1434 
1435    /* We must override the results of the anv_layout_to_* functions because
1436     * they are unaware of the acquire/release direction.
1437     */
1438    if (private_binding_acquire) {
1439       initial_aux_usage = isl_mod_info->aux_usage;
1440       initial_fast_clear = isl_mod_info->supports_clear_color ?
1441          initial_fast_clear : ANV_FAST_CLEAR_NONE;
1442    } else if (private_binding_release) {
1443       final_aux_usage = isl_mod_info->aux_usage;
1444       final_fast_clear = isl_mod_info->supports_clear_color ?
1445          final_fast_clear : ANV_FAST_CLEAR_NONE;
1446    }
1447 
1448    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1449     * We can handle transitions between CCS_D/E to and from NONE.  What we
1450     * don't yet handle is switching between CCS_E and CCS_D within a given
1451     * image.  Doing so in a performant way requires more detailed aux state
1452     * tracking such as what is done in i965.  For now, just assume that we
1453     * only have one type of compression.
1454     */
1455    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1456           final_aux_usage == ISL_AUX_USAGE_NONE ||
1457           initial_aux_usage == final_aux_usage);
1458 
1459    /* If initial aux usage is NONE, there is nothing to resolve */
1460    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1461       return;
1462 
1463    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1464 
1465    /* If the initial layout supports more fast clear than the final layout
1466     * then we need at least a partial resolve.
1467     */
1468    if (final_fast_clear < initial_fast_clear)
1469       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1470 
1471    if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
1472        final_aux_usage != ISL_AUX_USAGE_CCS_E)
1473       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
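
   /* To summarize the two checks above:
    *
    *    - the final layout allows less fast-clear than the initial one
    *                                            -> at least a partial resolve
    *    - we are leaving CCS_E for a non-CCS_E usage
    *                                            -> a full resolve
    *    - anything else                         -> no resolve needed
    */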
1474 
1475    if (resolve_op == ISL_AUX_OP_NONE)
1476       return;
1477 
1478    /* Perform a resolve to synchronize data between the main and aux buffer.
1479     * Before we begin, we must satisfy the cache flushing requirement specified
1480     * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1481     *
1482     *    Any transition from any value in {Clear, Render, Resolve} to a
1483     *    different value in {Clear, Render, Resolve} requires end of pipe
1484     *    synchronization.
1485     *
1486     * We perform a flush of the write cache before and after the clear and
1487     * resolve operations to meet this requirement.
1488     *
1489     * Unlike other drawing, fast clear operations are not properly
1490     * synchronized. The first PIPE_CONTROL here likely ensures that the
1491     * contents of the previous render or clear hit the render target before we
1492     * resolve and the second likely ensures that the resolve is complete before
1493     * we do any more rendering or clearing.
1494     */
1495    anv_add_pending_pipe_bits(cmd_buffer,
1496                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1497                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1498                              "after transition RT");
1499 
1500    for (uint32_t l = 0; l < level_count; l++) {
1501       uint32_t level = base_level + l;
1502 
1503       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1504       if (base_layer >= aux_layers)
1505          break; /* We will only get fewer layers as level increases */
1506       uint32_t level_layer_count =
1507          MIN2(layer_count, aux_layers - base_layer);
1508 
1509       for (uint32_t a = 0; a < level_layer_count; a++) {
1510          uint32_t array_layer = base_layer + a;
1511 
1512          /* If will_full_fast_clear is set, the caller promises to fast-clear
1513           * the largest portion of the specified range that it can.  For color
1514           * images, that means only the first LOD and array slice.
1515           */
1516          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1517             continue;
1518 
1519          if (image->vk.samples == 1) {
1520             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1521                                            image->planes[plane].primary_surface.isl.format,
1522                                            ISL_SWIZZLE_IDENTITY,
1523                                            aspect, level, array_layer, resolve_op,
1524                                            final_fast_clear);
1525          } else {
1526             /* We only support fast-clear on the first layer so partial
1527              * resolves should not be used on other layers as they will use
1528              * the clear color stored in memory that is only valid for layer0.
1529              */
1530             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1531                 array_layer != 0)
1532                continue;
1533 
1534             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1535                                            image->planes[plane].primary_surface.isl.format,
1536                                            ISL_SWIZZLE_IDENTITY,
1537                                            aspect, array_layer, resolve_op,
1538                                            final_fast_clear);
1539          }
1540       }
1541    }
1542 
1543    anv_add_pending_pipe_bits(cmd_buffer,
1544                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1545                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1546                              "after transition RT");
1547 }
1548 
1549 static MUST_CHECK VkResult
1550 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1551                                 uint32_t color_att_count,
1552                                 uint32_t color_att_valid)
1553 {
1554    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1555 
1556    /* Reserve one for the NULL state. */
1557    unsigned num_states = 1 + util_bitcount(color_att_valid);
1558    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1559    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
1560    gfx->att_states =
1561       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1562                              num_states * ss_stride, isl_dev->ss.align);
1563    if (gfx->att_states.map == NULL) {
1564       return anv_batch_set_error(&cmd_buffer->batch,
1565                                  VK_ERROR_OUT_OF_DEVICE_MEMORY);
1566    }
1567 
1568    struct anv_state next_state = gfx->att_states;
1569    next_state.alloc_size = isl_dev->ss.size;
1570 
1571    gfx->null_surface_state = next_state;
1572    next_state.offset += ss_stride;
1573    next_state.map += ss_stride;
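
   /* The allocation above is carved up as follows (sketch):
    *
    *    att_states + 0 * ss_stride : the shared null surface state
    *    att_states + n * ss_stride : the n-th valid color attachment
    *
    * Color attachments without a valid format don't get a slot of their own;
    * they alias the null surface state instead.
    */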
1574 
1575    gfx->color_att_count = color_att_count;
1576    for (uint32_t i = 0; i < color_att_count; i++) {
1577       if (color_att_valid & BITFIELD_BIT(i)) {
1578          gfx->color_att[i] = (struct anv_attachment) {
1579             .surface_state.state = next_state,
1580          };
1581          next_state.offset += ss_stride;
1582          next_state.map += ss_stride;
1583       } else {
1584          gfx->color_att[i] = (struct anv_attachment) {
1585             .surface_state.state = gfx->null_surface_state,
1586          };
1587       }
1588    }
1589    gfx->depth_att = (struct anv_attachment) { };
1590    gfx->stencil_att = (struct anv_attachment) { };
1591 
1592    return VK_SUCCESS;
1593 }
1594 
1595 static void
1596 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1597 {
1598    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1599 
1600    gfx->render_area = (VkRect2D) { };
1601    gfx->layer_count = 0;
1602    gfx->samples = 0;
1603 
1604    gfx->color_att_count = 0;
1605    gfx->depth_att = (struct anv_attachment) { };
1606    gfx->stencil_att = (struct anv_attachment) { };
1607    gfx->null_surface_state = ANV_STATE_NULL;
1608 }
1609 
1610 VkResult
1611 genX(BeginCommandBuffer)(
1612     VkCommandBuffer                             commandBuffer,
1613     const VkCommandBufferBeginInfo*             pBeginInfo)
1614 {
1615    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1616    VkResult result;
1617 
1618    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1619     * command buffer's state. Otherwise, we must *reset* its state. In both
1620     * cases we reset it.
1621     *
1622     * From the Vulkan 1.0 spec:
1623     *
1624     *    If a command buffer is in the executable state and the command buffer
1625     *    was allocated from a command pool with the
1626     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1627     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
1628     *    as if vkResetCommandBuffer had been called with
1629     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1630     *    the command buffer in the recording state.
1631     */
1632    anv_cmd_buffer_reset(cmd_buffer);
1633    anv_cmd_buffer_reset_rendering(cmd_buffer);
1634 
1635    cmd_buffer->usage_flags = pBeginInfo->flags;
1636 
1637    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1638     * primary level command buffers.
1639     *
1640     * From the Vulkan 1.0 spec:
1641     *
1642     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1643     *    secondary command buffer is considered to be entirely inside a render
1644     *    pass. If this is a primary command buffer, then this bit is ignored.
1645     */
1646    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1647       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1648 
1649    trace_intel_begin_cmd_buffer(&cmd_buffer->trace, cmd_buffer);
1650 
1651    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1652 
1653    /* We sometimes store vertex data in the dynamic state buffer for blorp
1654     * operations and our dynamic state stream may re-use data from previous
1655     * command buffers.  In order to prevent stale cache data, we flush the VF
1656     * cache.  We could do this on every blorp call but that's not really
1657     * needed as all of the data will get written by the CPU prior to the GPU
1658     * executing anything.  The chances are fairly high that we will use
1659     * blorp at least once per primary command buffer, so the flush shouldn't
1660     * be wasted.
1661     *
1662     * There is also a workaround on gfx8 which requires us to invalidate the
1663     * VF cache occasionally.  It's easier if we can assume we start with a
1664     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1665     */
1666    anv_add_pending_pipe_bits(cmd_buffer,
1667                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1668                              "new cmd buffer");
1669 
1670    /* Re-emit the aux table register in every command buffer.  This way we're
1671     * ensured that we have the table even if this command buffer doesn't
1672     * initialize any images.
1673     */
1674    if (cmd_buffer->device->info.has_aux_map) {
1675       anv_add_pending_pipe_bits(cmd_buffer,
1676                                 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
1677                                 "new cmd buffer with aux-tt");
1678    }
1679 
1680    /* We send an "Indirect State Pointers Disable" packet at
1681     * EndCommandBuffer, so all push constant packets are ignored during a
1682     * context restore. Documentation says after that command, we need to
1683     * emit push constants again before any rendering operation. So we
1684     * flag them dirty here to make sure they get emitted.
1685     */
1686    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1687 
1688    if (cmd_buffer->usage_flags &
1689        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1690       struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1691 
1692       const VkCommandBufferInheritanceRenderingInfoKHR *inheritance_info =
1693          vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1694                                                           pBeginInfo);
1695 
1696       /* We can't get this information from the inheritance info */
1697       gfx->render_area = (VkRect2D) { };
1698       gfx->layer_count = 0;
1699       gfx->samples = 0;
1700       gfx->depth_att = (struct anv_attachment) { };
1701       gfx->stencil_att = (struct anv_attachment) { };
1702 
1703       if (inheritance_info == NULL) {
1704          gfx->rendering_flags = 0;
1705          gfx->view_mask = 0;
1706          gfx->samples = 0;
1707          result = anv_cmd_buffer_init_attachments(cmd_buffer, 0, 0);
1708          if (result != VK_SUCCESS)
1709             return result;
1710       } else {
1711          gfx->rendering_flags = inheritance_info->flags;
1712          gfx->view_mask = inheritance_info->viewMask;
1713          gfx->samples = inheritance_info->rasterizationSamples;
1714 
1715          uint32_t color_att_valid = 0;
1716          uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1717          for (uint32_t i = 0; i < color_att_count; i++) {
1718             VkFormat format = inheritance_info->pColorAttachmentFormats[i];
1719             if (format != VK_FORMAT_UNDEFINED)
1720                color_att_valid |= BITFIELD_BIT(i);
1721          }
1722          result = anv_cmd_buffer_init_attachments(cmd_buffer,
1723                                                   color_att_count,
1724                                                   color_att_valid);
1725          if (result != VK_SUCCESS)
1726             return result;
1727 
1728          for (uint32_t i = 0; i < color_att_count; i++) {
1729             gfx->color_att[i].vk_format =
1730                inheritance_info->pColorAttachmentFormats[i];
1731          }
1732          gfx->depth_att.vk_format =
1733             inheritance_info->depthAttachmentFormat;
1734          gfx->stencil_att.vk_format =
1735             inheritance_info->stencilAttachmentFormat;
1736       }
1737 
1738       /* Try to figure out the depth buffer if we can */
1739       if (pBeginInfo->pInheritanceInfo->renderPass != VK_NULL_HANDLE &&
1740           pBeginInfo->pInheritanceInfo->framebuffer != VK_NULL_HANDLE) {
1741          VK_FROM_HANDLE(vk_render_pass, pass,
1742                         pBeginInfo->pInheritanceInfo->renderPass);
1743          VK_FROM_HANDLE(vk_framebuffer, fb,
1744                         pBeginInfo->pInheritanceInfo->framebuffer);
1745          const struct vk_subpass *subpass =
1746             &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1747 
1748          if (!(fb->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR) &&
1749              subpass->depth_stencil_attachment != NULL) {
1750             const struct vk_subpass_attachment *att =
1751                subpass->depth_stencil_attachment;
1752 
1753             assert(att->attachment < fb->attachment_count);
1754             ANV_FROM_HANDLE(anv_image_view, iview,
1755                             fb->attachments[att->attachment]);
1756 
1757             if (iview->vk.image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
1758                assert(gfx->depth_att.vk_format == iview->vk.format);
1759                gfx->depth_att.iview = iview;
1760                gfx->depth_att.layout = att->layout;
1761                gfx->depth_att.aux_usage =
1762                   anv_layout_to_aux_usage(&cmd_buffer->device->info,
1763                                           iview->image,
1764                                           VK_IMAGE_ASPECT_DEPTH_BIT,
1765                                           VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
1766                                           att->layout);
1767             }
1768 
1769             if (iview->vk.image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1770                assert(gfx->stencil_att.vk_format == iview->vk.format);
1771                gfx->stencil_att.iview = iview;
1772                gfx->stencil_att.layout = att->stencil_layout;
1773                gfx->stencil_att.aux_usage =
1774                   anv_layout_to_aux_usage(&cmd_buffer->device->info,
1775                                           iview->image,
1776                                           VK_IMAGE_ASPECT_STENCIL_BIT,
1777                                           VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
1778                                           att->stencil_layout);
1779             }
1780          }
1781       }
1782 
1783       if (gfx->depth_att.iview != NULL) {
1784          cmd_buffer->state.hiz_enabled =
1785             isl_aux_usage_has_hiz(gfx->depth_att.aux_usage);
1786       }
1787 
1788       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1789    }
1790 
1791 #if GFX_VERx10 >= 75
1792    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1793       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1794          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1795 
1796       /* If the secondary buffer supports conditional rendering, we should
1797        * emit commands as if conditional rendering is enabled.
1798        */
1799       cmd_buffer->state.conditional_render_enabled =
1800          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1801    }
1802 #endif
1803 
1804    return VK_SUCCESS;
1805 }
1806 
1807 /* From the PRM, Volume 2a:
1808  *
1809  *    "Indirect State Pointers Disable
1810  *
1811  *    At the completion of the post-sync operation associated with this pipe
1812  *    control packet, the indirect state pointers in the hardware are
1813  *    considered invalid; the indirect pointers are not saved in the context.
1814  *    If any new indirect state commands are executed in the command stream
1815  *    while the pipe control is pending, the new indirect state commands are
1816  *    preserved.
1817  *
1818  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1819  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1820  *    commands are only considered as Indirect State Pointers. Once ISP is
1821  *    issued in a context, SW must initialize by programming push constant
1822  *    commands for all the shaders (at least to zero length) before attempting
1823  *    any rendering operation for the same context."
1824  *
1825  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1826  * even though they point to a BO that has been already unreferenced at
1827  * the end of the previous batch buffer. This has been fine so far since
1828  * we are protected by the scratch page (every address not covered by
1829  * a BO should be pointing to the scratch page). But on CNL, it is
1830  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1831  * instruction.
1832  *
1833  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1834  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1835  * context restore, so the mentioned hang doesn't happen. However,
1836  * software must program push constant commands for all stages prior to
1837  * rendering anything. So we flag them dirty in BeginCommandBuffer.
1838  *
1839  * Finally, we also make sure to stall at pixel scoreboard to make sure the
1840  * constants have been loaded into the EUs prior to disabling the push
1841  * constants, so that a previous 3DPRIMITIVE doesn't hang.
1842  */
1843 static void
1844 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1845 {
1846    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1847       pc.StallAtPixelScoreboard = true;
1848       pc.CommandStreamerStallEnable = true;
1849       anv_debug_dump_pc(pc);
1850    }
1851    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1852       pc.IndirectStatePointersDisable = true;
1853       pc.CommandStreamerStallEnable = true;
1854       anv_debug_dump_pc(pc);
1855    }
1856 }
1857 
1858 VkResult
1859 genX(EndCommandBuffer)(
1860     VkCommandBuffer                             commandBuffer)
1861 {
1862    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1863 
1864    if (anv_batch_has_error(&cmd_buffer->batch))
1865       return cmd_buffer->batch.status;
1866 
1867    anv_measure_endcommandbuffer(cmd_buffer);
1868 
1869    /* We want every command buffer to start with the PMA fix in a known state,
1870     * so we disable it at the end of the command buffer.
1871     */
1872    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1873 
1874    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1875 
1876    emit_isp_disable(cmd_buffer);
1877 
1878    trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer,
1879                               cmd_buffer->vk.level);
1880 
1881    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1882 
1883    return VK_SUCCESS;
1884 }
1885 
1886 void
1887 genX(CmdExecuteCommands)(
1888     VkCommandBuffer                             commandBuffer,
1889     uint32_t                                    commandBufferCount,
1890     const VkCommandBuffer*                      pCmdBuffers)
1891 {
1892    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1893 
1894    assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1895 
1896    if (anv_batch_has_error(&primary->batch))
1897       return;
1898 
1899    /* The secondary command buffers will assume that the PMA fix is disabled
1900     * when they begin executing.  Make sure this is true.
1901     */
1902    genX(cmd_buffer_enable_pma_fix)(primary, false);
1903 
1904    /* The secondary command buffer doesn't know which textures etc. have been
1905     * flushed prior to their execution.  Apply those flushes now.
1906     */
1907    genX(cmd_buffer_apply_pipe_flushes)(primary);
1908 
1909    for (uint32_t i = 0; i < commandBufferCount; i++) {
1910       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1911 
1912       assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1913       assert(!anv_batch_has_error(&secondary->batch));
1914 
1915 #if GFX_VERx10 >= 75
1916       if (secondary->state.conditional_render_enabled) {
1917          if (!primary->state.conditional_render_enabled) {
1918             /* The secondary buffer was recorded as if it would be executed
1919              * with conditional rendering, so we must satisfy this dependency
1920              * regardless of whether conditional rendering is enabled in the primary.
1921              */
1922             struct mi_builder b;
1923             mi_builder_init(&b, &primary->device->info, &primary->batch);
1924             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1925                          mi_imm(UINT64_MAX));
1926          }
1927       }
1928 #endif
1929 
1930       if (secondary->usage_flags &
1931           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1932          /* If we're continuing a render pass from the primary, we need to
1933           * copy the surface states for the current subpass into the storage
1934           * we allocated for them in BeginCommandBuffer.
1935           */
1936          struct anv_bo *ss_bo =
1937             primary->device->surface_state_pool.block_pool.bo;
1938          struct anv_state src_state = primary->state.gfx.att_states;
1939          struct anv_state dst_state = secondary->state.gfx.att_states;
1940          assert(src_state.alloc_size == dst_state.alloc_size);
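
         /* Both sets of attachment states were allocated from the device's
          * shared surface state pool, so a single BO with two different
          * offsets is enough to describe the copy below.
          */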
1941 
1942          genX(cmd_buffer_so_memcpy)(primary,
1943                                     (struct anv_address) {
1944                                        .bo = ss_bo,
1945                                        .offset = dst_state.offset,
1946                                     },
1947                                     (struct anv_address) {
1948                                        .bo = ss_bo,
1949                                        .offset = src_state.offset,
1950                                     },
1951                                     src_state.alloc_size);
1952       }
1953 
1954       anv_cmd_buffer_add_secondary(primary, secondary);
1955 
1956       assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1957              secondary->perf_query_pool == primary->perf_query_pool);
1958       if (secondary->perf_query_pool)
1959          primary->perf_query_pool = secondary->perf_query_pool;
1960 
1961 #if GFX_VERx10 == 120
1962       if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
1963          primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
1964 #endif
1965    }
1966 
1967    /* The secondary isn't counted in our VF cache tracking so we need to
1968     * invalidate the whole thing.
1969     */
1970    if (GFX_VER >= 8 && GFX_VER <= 9) {
1971       anv_add_pending_pipe_bits(primary,
1972                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1973                                 "Secondary cmd buffer not tracked in VF cache");
1974    }
1975 
1976    /* The secondary may have selected a different pipeline (3D or compute) and
1977     * may have changed the current L3$ configuration.  Reset our tracking
1978     * variables to invalid values to ensure that we re-emit these in the case
1979     * where we do any draws or compute dispatches from the primary after the
1980     * secondary has returned.
1981     */
1982    primary->state.current_pipeline = UINT32_MAX;
1983    primary->state.current_l3_config = NULL;
1984    primary->state.current_hash_scale = 0;
1985 
1986    /* Each of the secondary command buffers will use its own state base
1987     * address.  We need to re-emit state base address for the primary after
1988     * all of the secondaries are done.
1989     *
1990     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1991     * address calls?
1992     */
1993    genX(cmd_buffer_emit_state_base_address)(primary);
1994 }
1995 
1996 /**
1997  * Program the hardware to use the specified L3 configuration.
1998  */
1999 void
2000 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
2001                            const struct intel_l3_config *cfg)
2002 {
2003    assert(cfg || GFX_VER >= 12);
2004    if (cfg == cmd_buffer->state.current_l3_config)
2005       return;
2006 
2007 #if GFX_VER >= 11
2008    /* On Gfx11+ we use only one config, so verify it remains the same and skip
2009     * the stalling programming entirely.
2010     */
2011    assert(cfg == cmd_buffer->device->l3_config);
2012 #else
2013    if (INTEL_DEBUG(DEBUG_L3)) {
2014       mesa_logd("L3 config transition: ");
2015       intel_dump_l3_config(cfg, stderr);
2016    }
2017 
2018    /* According to the hardware docs, the L3 partitioning can only be changed
2019     * while the pipeline is completely drained and the caches are flushed,
2020     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
2021     */
2022    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2023       pc.DCFlushEnable = true;
2024       pc.PostSyncOperation = NoWrite;
2025       pc.CommandStreamerStallEnable = true;
2026       anv_debug_dump_pc(pc);
2027    }
2028 
2029    /* ...followed by a second pipelined PIPE_CONTROL that initiates
2030     * invalidation of the relevant caches.  Note that because RO invalidation
2031     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
2032     * command is processed by the CS) we cannot combine it with the previous
2033     * stalling flush as the hardware documentation suggests, because that
2034     * would cause the CS to stall on previous rendering *after* RO
2035     * invalidation and wouldn't prevent the RO caches from being polluted by
2036     * concurrent rendering before the stall completes.  This intentionally
2037     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
2038     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
2039     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
2040     * already guarantee that there is no concurrent GPGPU kernel execution
2041     * (see SKL HSD 2132585).
2042     */
2043    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2044       pc.TextureCacheInvalidationEnable = true;
2045       pc.ConstantCacheInvalidationEnable = true;
2046       pc.InstructionCacheInvalidateEnable = true;
2047       pc.StateCacheInvalidationEnable = true;
2048       pc.PostSyncOperation = NoWrite;
2049       anv_debug_dump_pc(pc);
2050    }
2051 
2052    /* Now send a third stalling flush to make sure that invalidation is
2053     * complete when the L3 configuration registers are modified.
2054     */
2055    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2056       pc.DCFlushEnable = true;
2057       pc.PostSyncOperation = NoWrite;
2058       pc.CommandStreamerStallEnable = true;
2059       anv_debug_dump_pc(pc);
2060    }
2061 
2062    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
2063 #endif /* GFX_VER >= 11 */
2064    cmd_buffer->state.current_l3_config = cfg;
2065 }
2066 
2067 enum anv_pipe_bits
2068 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
2069                               struct anv_device *device,
2070                               uint32_t current_pipeline,
2071                               enum anv_pipe_bits bits)
2072 {
2073    /*
2074     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
2075     *
2076     *    Write synchronization is a special case of end-of-pipe
2077     *    synchronization that requires that the render cache and/or depth
2078     *    related caches are flushed to memory, where the data will become
2079     *    globally visible. This type of synchronization is required prior to
2080     *    SW (CPU) actually reading the result data from memory, or initiating
2081     *    an operation that will use as a read surface (such as a texture
2082     *    surface) a previous render target and/or depth/stencil buffer
2083     *
2084     *
2085     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2086     *
2087     *    Exercising the write cache flush bits (Render Target Cache Flush
2088     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
2089     *    ensures the write caches are flushed and doesn't guarantee the data
2090     *    is globally visible.
2091     *
2092     *    SW can track the completion of the end-of-pipe-synchronization by
2093     *    using "Notify Enable" and "PostSync Operation - Write Immediate
2094     *    Data" in the PIPE_CONTROL command.
2095     *
2096     * In other words, flushes are pipelined while invalidations are handled
2097     * immediately.  Therefore, if we're flushing anything then we need to
2098     * schedule an end-of-pipe sync before any invalidations can happen.
2099     */
2100    if (bits & ANV_PIPE_FLUSH_BITS)
2101       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2102 
2103 
2104    /* HSD 1209978178: docs say that before programming the aux table:
2105     *
2106     *    "Driver must ensure that the engine is IDLE but ensure it doesn't
2107     *    add extra flushes in the case it knows that the engine is already
2108     *    IDLE."
2109     */
2110    if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
2111       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2112 
2113    /* If we're going to do an invalidate and we have a pending end-of-pipe
2114     * sync that has yet to be resolved, we do the end-of-pipe sync now.
2115     */
2116    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
2117        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
2118       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
2119       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2120    }
2121 
2122    /* Wa_1409226450, Wait for EU to be idle before pipe control which
2123     * invalidates the instruction cache
2124     */
2125    if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
2126       bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2127 
2128    /* Project: SKL / Argument: LRI Post Sync Operation [23]
2129     *
2130     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
2131     *  programmed prior to programming a PIPECONTROL command with "LRI
2132     *  Post Sync Operation" in GPGPU mode of operation (i.e when
2133     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
2134     *
2135     * The same text exists a few rows below for Post Sync Op.
2136     */
2137    if (bits & ANV_PIPE_POST_SYNC_BIT) {
2138       if (GFX_VER == 9 && current_pipeline == GPGPU)
2139          bits |= ANV_PIPE_CS_STALL_BIT;
2140       bits &= ~ANV_PIPE_POST_SYNC_BIT;
2141    }
2142 
2143    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2144                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
2145       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2146 #if GFX_VER >= 12
2147          pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2148          pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2149 #else
2150          /* Flushing HDC pipeline requires DC Flush on earlier HW. */
2151          pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2152 #endif
2153          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2154          pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2155          pipe.RenderTargetCacheFlushEnable =
2156             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2157 
2158          /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2159           * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2160           */
2161 #if GFX_VER >= 12
2162          pipe.DepthStallEnable =
2163             pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
2164 #else
2165          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2166 #endif
2167 
2168 #if GFX_VERx10 >= 125
2169          pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2170 #endif
2171 
2172          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2173          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2174 
2175          /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
2176           *
2177           *    "The most common action to perform upon reaching a
2178           *    synchronization point is to write a value out to memory. An
2179           *    immediate value (included with the synchronization command) may
2180           *    be written."
2181           *
2182           *
2183           * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
2184           *
2185           *    "In case the data flushed out by the render engine is to be
2186           *    read back in to the render engine in coherent manner, then the
2187           *    render engine has to wait for the fence completion before
2188           *    accessing the flushed data. This can be achieved by following
2189           *    means on various products: PIPE_CONTROL command with CS Stall
2190           *    and the required write caches flushed with Post-Sync-Operation
2191           *    as Write Immediate Data.
2192           *
2193           *    Example:
2194           *       - Workload-1 (3D/GPGPU/MEDIA)
2195           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2196           *         Immediate Data, Required Write Cache Flush bits set)
2197           *       - Workload-2 (Can use the data produce or output by
2198           *         Workload-1)
2199           */
2200          if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
2201             pipe.CommandStreamerStallEnable = true;
2202             pipe.PostSyncOperation = WriteImmediateData;
2203             pipe.Address = device->workaround_address;
2204          }
2205 
2206          /*
2207           * According to the Broadwell documentation, any PIPE_CONTROL with the
2208           * "Command Streamer Stall" bit set must also have another bit set,
2209           * chosen from the following options:
2210           *
2211           *  - Render Target Cache Flush
2212           *  - Depth Cache Flush
2213           *  - Stall at Pixel Scoreboard
2214           *  - Post-Sync Operation
2215           *  - Depth Stall
2216           *  - DC Flush Enable
2217           *
2218           * I chose "Stall at Pixel Scoreboard" since that's what we use in
2219           * mesa and it seems to work fine. The choice is fairly arbitrary.
2220           */
2221          if (pipe.CommandStreamerStallEnable &&
2222              !pipe.RenderTargetCacheFlushEnable &&
2223              !pipe.DepthCacheFlushEnable &&
2224              !pipe.StallAtPixelScoreboard &&
2225              !pipe.PostSyncOperation &&
2226              !pipe.DepthStallEnable &&
2227              !pipe.DCFlushEnable)
2228             pipe.StallAtPixelScoreboard = true;
2229          anv_debug_dump_pc(pipe);
2230       }
2231 
2232       /* If a render target flush was emitted, then we can toggle off the bit
2233        * saying that render target writes are ongoing.
2234        */
2235       if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
2236          bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
2237 
2238       if (GFX_VERx10 == 75) {
2239          /* Haswell needs additional workarounds:
2240           *
2241           * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2242           *
2243           *    Option 1:
2244           *    PIPE_CONTROL command with the CS Stall and the required write
2245           *    caches flushed with Post-SyncOperation as Write Immediate Data
2246           *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
2247           *    space) commands.
2248           *
2249           *    Example:
2250           *       - Workload-1
2251           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2252           *         Immediate Data, Required Write Cache Flush bits set)
2253           *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
2254           *       - Workload-2 (Can use the data produce or output by
2255           *         Workload-1)
2256           *
2257           * Unfortunately, both the PRMs and the internal docs are a bit
2258           * out-of-date in this regard.  What the Windows driver does (and
2259           * this appears to actually work) is to emit a register read from the
2260           * memory address written by the pipe control above.
2261           *
2262           * What register we load into doesn't matter.  We choose an indirect
2263           * rendering register because we know it always exists and it's one
2264           * of the first registers the command parser allows us to write.  If
2265           * you don't have command parser support in your kernel (pre-4.2),
2266           * this will get turned into MI_NOOP and you won't get the
2267           * workaround.  Unfortunately, there's just not much we can do in
2268           * that case.  This register is perfectly safe to write since we
2269           * always re-load all of the indirect draw registers right before
2270           * 3DPRIMITIVE when needed anyway.
2271           */
2272          anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2273             lrm.RegisterAddress  = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
2274             lrm.MemoryAddress = device->workaround_address;
2275          }
2276       }
2277 
2278       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2279                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2280    }
2281 
2282    if (bits & ANV_PIPE_INVALIDATE_BITS) {
2283       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2284        *
2285        *    "If the VF Cache Invalidation Enable is set to a 1 in a
2286        *    PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2287        *    0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2288        *    prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2289        *    a 1."
2290        *
2291        * This appears to hang Broadwell, so we restrict it to just gfx9.
2292        */
2293       if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2294          anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2295 
2296       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2297          pipe.StateCacheInvalidationEnable =
2298             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2299          pipe.ConstantCacheInvalidationEnable =
2300             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2301 #if GFX_VER >= 12
2302          /* Invalidates the L3 cache part in which index & vertex data is loaded
2303           * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2304           */
2305          pipe.L3ReadOnlyCacheInvalidationEnable =
2306             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2307 #endif
2308          pipe.VFCacheInvalidationEnable =
2309             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2310          pipe.TextureCacheInvalidationEnable =
2311             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2312          pipe.InstructionCacheInvalidateEnable =
2313             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2314 
2315          /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2316           *
2317           *    "When VF Cache Invalidate is set “Post Sync Operation” must be
2318           *    enabled to “Write Immediate Data” or “Write PS Depth Count” or
2319           *    “Write Timestamp”.
2320           */
2321          if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
2322             pipe.PostSyncOperation = WriteImmediateData;
2323             pipe.Address = device->workaround_address;
2324          }
2325          anv_debug_dump_pc(pipe);
2326       }
2327 
2328 #if GFX_VER == 12
2329       if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info.has_aux_map) {
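         /* Kick off an invalidation of the aux-map TLB by writing 1 into the
          * AUX_INV register.  The end-of-pipe sync requested earlier for
          * ANV_PIPE_AUX_TABLE_INVALIDATE_BIT ensures the engine is idle by
          * the time this LRI executes.
          */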
2330          anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2331             lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
2332             lri.DataDWord = 1;
2333          }
2334       }
2335 #endif
2336 
2337       bits &= ~ANV_PIPE_INVALIDATE_BITS;
2338    }
2339 
2340    return bits;
2341 }
2342 
2343 void
2344 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
2345 {
2346    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
2347 
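   /* always_flush_cache is a debug option: it turns every call into a full
    * flush + invalidate, which is handy for ruling out missing-flush bugs.
    */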
2348    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
2349       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
2350    else if (bits == 0)
2351       return;
2352 
2353    bool trace_flush =
2354       (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
2355    if (trace_flush)
2356       trace_intel_begin_stall(&cmd_buffer->trace, cmd_buffer);
2357 
2358    if ((GFX_VER >= 8 && GFX_VER <= 9) &&
2359        (bits & ANV_PIPE_CS_STALL_BIT) &&
2360        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
2361       /* If we are doing a VF cache invalidate AND a CS stall (it must be
2362        * both) then we can reset our vertex cache tracking.
2363        */
2364       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
2365              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
2366       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
2367              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
2368    }
2369 
2370    cmd_buffer->state.pending_pipe_bits =
2371       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
2372                                     cmd_buffer->device,
2373                                     cmd_buffer->state.current_pipeline,
2374                                     bits);
2375 
2376    if (trace_flush) {
2377       trace_intel_end_stall(&cmd_buffer->trace, cmd_buffer, bits,
2378                             anv_pipe_flush_bit_to_ds_stall_flag, NULL);
2379    }
2380 }
2381 
2382 static void
2383 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
2384                    const VkDependencyInfoKHR *dep_info,
2385                    const char *reason)
2386 {
2387    /* XXX: Right now, we're really dumb and just flush whatever categories
2388     * the app asks for.  One of these days we may make this a bit better
2389     * but right now that's all the hardware allows for in most areas.
2390     */
2391    VkAccessFlags2KHR src_flags = 0;
2392    VkAccessFlags2KHR dst_flags = 0;
2393 
2394    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
2395       src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
2396       dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
2397    }
2398 
2399    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
2400       src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
2401       dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
2402    }
2403 
2404    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
2405       const VkImageMemoryBarrier2KHR *img_barrier =
2406          &dep_info->pImageMemoryBarriers[i];
2407 
2408       src_flags |= img_barrier->srcAccessMask;
2409       dst_flags |= img_barrier->dstAccessMask;
2410 
2411       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
2412       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
2413 
2414       uint32_t base_layer, layer_count;
2415       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
2416          base_layer = 0;
2417          layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
2418       } else {
2419          base_layer = range->baseArrayLayer;
2420          layer_count = vk_image_subresource_layer_count(&image->vk, range);
2421       }
2422       const uint32_t level_count =
2423          vk_image_subresource_level_count(&image->vk, range);
2424 
2425       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2426          transition_depth_buffer(cmd_buffer, image,
2427                                  base_layer, layer_count,
2428                                  img_barrier->oldLayout,
2429                                  img_barrier->newLayout,
2430                                  false /* will_full_fast_clear */);
2431       }
2432 
2433       if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2434          transition_stencil_buffer(cmd_buffer, image,
2435                                    range->baseMipLevel, level_count,
2436                                    base_layer, layer_count,
2437                                    img_barrier->oldLayout,
2438                                    img_barrier->newLayout,
2439                                    false /* will_full_fast_clear */);
2440       }
2441 
2442       if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2443          VkImageAspectFlags color_aspects =
2444             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2445          anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2446             transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2447                                     range->baseMipLevel, level_count,
2448                                     base_layer, layer_count,
2449                                     img_barrier->oldLayout,
2450                                     img_barrier->newLayout,
2451                                     img_barrier->srcQueueFamilyIndex,
2452                                     img_barrier->dstQueueFamilyIndex,
2453                                     false /* will_full_fast_clear */);
2454          }
2455       }
2456    }
2457 
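   /* Writes described by src_flags need the corresponding caches flushed;
    * reads described by dst_flags need the corresponding caches invalidated
    * before they can observe the new data.
    */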
2458    enum anv_pipe_bits bits =
2459       anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2460       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2461 
2462    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2463 }
2464 
2465 void genX(CmdPipelineBarrier2KHR)(
2466     VkCommandBuffer                             commandBuffer,
2467     const VkDependencyInfoKHR*                  pDependencyInfo)
2468 {
2469    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2470 
2471    cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2472 }
2473 
2474 static void
2475 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2476 {
2477    VkShaderStageFlags stages =
2478       cmd_buffer->state.gfx.pipeline->active_stages;
2479 
2480    /* In order to avoid thrash, we assume that vertex and fragment stages
2481     * always exist.  In the rare case where one is missing *and* the other
2482     * uses push constants, this may be suboptimal.  However, avoiding stalls
2483     * seems more important.
2484     */
2485    stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2486    if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2487       stages |= VK_SHADER_STAGE_VERTEX_BIT;
2488 
2489    if (stages == cmd_buffer->state.gfx.push_constant_stages)
2490       return;
2491 
2492    const unsigned push_constant_kb =
2493       cmd_buffer->device->info.max_constant_urb_size_kb;
2494 
2495    const unsigned num_stages =
2496       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2497    unsigned size_per_stage = push_constant_kb / num_stages;
2498 
2499    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2500     * units of 2KB.  Incidentally, these are the same platforms that have
2501     * 32KB worth of push constant space.
2502     */
2503    if (push_constant_kb == 32)
2504       size_per_stage &= ~1u;
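   /* Illustrative example (not from the PRM): with 32KB of push constant
    * space and all five graphics stages active, size_per_stage = 32 / 5 = 6KB,
    * which is already a multiple of 2KB.  VS, HS, DS and GS each get 6KB in
    * the loop below, leaving 32 - 24 = 8KB for the PS allocation that follows.
    */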
2505 
2506    uint32_t kb_used = 0;
2507    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2508       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2509       anv_batch_emit(&cmd_buffer->batch,
2510                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2511          alloc._3DCommandSubOpcode  = 18 + i;
2512          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2513          alloc.ConstantBufferSize   = push_size;
2514       }
2515       kb_used += push_size;
2516    }
2517 
2518    anv_batch_emit(&cmd_buffer->batch,
2519                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2520       alloc.ConstantBufferOffset = kb_used;
2521       alloc.ConstantBufferSize = push_constant_kb - kb_used;
2522    }
2523 
2524 #if GFX_VERx10 == 125
2525    /* Wa_22011440098
2526     *
2527     * In 3D mode, after programming the push constant alloc command,
2528     * immediately program a push constant command (ZERO length) without any
2529     * commit between them.
2530     */
2531    if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
2532       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
2533          c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
2534       }
2535    }
2536 #endif
2537 
2538    cmd_buffer->state.gfx.push_constant_stages = stages;
2539 
2540    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2541     *
2542     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2543     *    the next 3DPRIMITIVE command after programming the
2544     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2545     *
2546     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2547     * pipeline setup, we need to dirty push constants.
2548     */
2549    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2550 }
2551 
2552 static VkResult
2553 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2554                    struct anv_cmd_pipeline_state *pipe_state,
2555                    struct anv_shader_bin *shader,
2556                    struct anv_state *bt_state)
2557 {
2558    uint32_t state_offset;
2559 
2560    struct anv_pipeline_bind_map *map = &shader->bind_map;
2561    if (map->surface_count == 0) {
2562       *bt_state = (struct anv_state) { 0, };
2563       return VK_SUCCESS;
2564    }
2565 
2566    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2567                                                   map->surface_count,
2568                                                   &state_offset);
2569    uint32_t *bt_map = bt_state->map;
2570 
2571    if (bt_state->map == NULL)
2572       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2573 
2574    /* We only need to emit relocs if we're not using softpin.  If we are using
2575     * softpin then we always keep all user-allocated memory objects resident.
2576     */
2577    const bool need_client_mem_relocs =
2578       anv_use_relocations(cmd_buffer->device->physical);
2579    struct anv_push_constants *push = &pipe_state->push_constants;
2580 
2581    for (uint32_t s = 0; s < map->surface_count; s++) {
2582       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2583 
2584       struct anv_state surface_state;
2585 
2586       switch (binding->set) {
2587       case ANV_DESCRIPTOR_SET_NULL:
2588          bt_map[s] = 0;
2589          break;
2590 
2591       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2592          /* Color attachment binding */
2593          assert(shader->stage == MESA_SHADER_FRAGMENT);
2594          if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2595             const struct anv_attachment *att =
2596                &cmd_buffer->state.gfx.color_att[binding->index];
2597             surface_state = att->surface_state.state;
2598          } else {
2599             surface_state = cmd_buffer->state.gfx.null_surface_state;
2600          }
2601          assert(surface_state.map);
2602          bt_map[s] = surface_state.offset + state_offset;
2603          break;
2604 
2605       case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2606          struct anv_state surface_state =
2607             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2608 
2609          struct anv_address constant_data = {
2610             .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2611             .offset = shader->kernel.offset +
2612                       shader->prog_data->const_data_offset,
2613          };
2614          unsigned constant_data_size = shader->prog_data->const_data_size;
2615 
2616          const enum isl_format format =
2617             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2618                                                VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2619          anv_fill_buffer_surface_state(cmd_buffer->device,
2620                                        surface_state, format,
2621                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2622                                        constant_data, constant_data_size, 1);
2623 
2624          assert(surface_state.map);
2625          bt_map[s] = surface_state.offset + state_offset;
2626          add_surface_reloc(cmd_buffer, surface_state, constant_data);
2627          break;
2628       }
2629 
2630       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2631          /* This is always the first binding for compute shaders */
2632          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2633 
2634          struct anv_state surface_state =
2635             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2636 
2637          const enum isl_format format =
2638             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2639                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2640          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2641                                        format,
2642                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2643                                        cmd_buffer->state.compute.num_workgroups,
2644                                        12, 1);
2645 
2646          assert(surface_state.map);
2647          bt_map[s] = surface_state.offset + state_offset;
2648          if (need_client_mem_relocs) {
2649             add_surface_reloc(cmd_buffer, surface_state,
2650                               cmd_buffer->state.compute.num_workgroups);
2651          }
2652          break;
2653       }
2654 
2655       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2656          /* This is a descriptor set buffer so the set index is actually
2657           * given by binding->binding.  (Yes, that's confusing.)
2658           */
2659          struct anv_descriptor_set *set =
2660             pipe_state->descriptors[binding->index];
2661          assert(set->desc_mem.alloc_size);
2662          assert(set->desc_surface_state.alloc_size);
2663          bt_map[s] = set->desc_surface_state.offset + state_offset;
2664          add_surface_reloc(cmd_buffer, set->desc_surface_state,
2665                            anv_descriptor_set_address(set));
2666          break;
2667       }
2668 
2669       default: {
2670          assert(binding->set < MAX_SETS);
2671          const struct anv_descriptor_set *set =
2672             pipe_state->descriptors[binding->set];
2673          if (binding->index >= set->descriptor_count) {
2674             /* From the Vulkan spec section entitled "DescriptorSet and
2675              * Binding Assignment":
2676              *
2677              *    "If the array is runtime-sized, then array elements greater
2678              *    than or equal to the size of that binding in the bound
2679              *    descriptor set must not be used."
2680              *
2681              * Unfortunately, the compiler isn't smart enough to figure out
2682              * when a dynamic binding isn't used so it may grab the whole
2683              * array and stick it in the binding table.  In this case, it's
2684              * safe to just skip those bindings that are OOB.
2685              */
2686             assert(binding->index < set->layout->descriptor_count);
2687             continue;
2688          }
2689          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2690 
2691          switch (desc->type) {
2692          case VK_DESCRIPTOR_TYPE_SAMPLER:
2693             /* Nothing for us to do here */
2694             continue;
2695 
2696          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2697          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2698          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2699             if (desc->image_view) {
2700                struct anv_surface_state sstate =
2701                   (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2702                   desc->image_view->planes[binding->plane].general_sampler_surface_state :
2703                   desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2704                surface_state = sstate.state;
2705                assert(surface_state.alloc_size);
2706                if (need_client_mem_relocs)
2707                   add_surface_state_relocs(cmd_buffer, sstate);
2708             } else {
2709                surface_state = cmd_buffer->device->null_surface_state;
2710             }
2711             break;
2712          }
2713 
2714          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2715             if (desc->image_view) {
2716                struct anv_surface_state sstate =
2717                   binding->lowered_storage_surface
2718                   ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2719                   : desc->image_view->planes[binding->plane].storage_surface_state;
2720                surface_state = sstate.state;
2721                assert(surface_state.alloc_size);
2722                if (surface_state.offset == 0) {
2723                   mesa_loge("Bound an image to a descriptor where the "
2724                             "descriptor does not have NonReadable "
2725                             "set and the image does not have a "
2726                             "corresponding SPIR-V format enum.");
2727                   vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2728                                   VK_DEBUG_REPORT_ERROR_BIT_EXT,
2729                                   &desc->image_view->vk.base,
2730                                   __LINE__, 0, "anv",
2731                                   "Bound a image to a descriptor where the "
2732                                   "descriptor does not have NonReadable "
2733                                   "set and the image does not have a "
2734                                   "corresponding SPIR-V format enum.");
2735                }
2736                if (surface_state.offset && need_client_mem_relocs)
2737                   add_surface_state_relocs(cmd_buffer, sstate);
2738             } else {
2739                surface_state = cmd_buffer->device->null_surface_state;
2740             }
2741             break;
2742          }
2743 
2744          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2745          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2746             if (desc->set_buffer_view) {
2747                surface_state = desc->set_buffer_view->surface_state;
2748                assert(surface_state.alloc_size);
2749                if (need_client_mem_relocs) {
2750                   add_surface_reloc(cmd_buffer, surface_state,
2751                                     desc->set_buffer_view->address);
2752                }
2753             } else {
2754                surface_state = cmd_buffer->device->null_surface_state;
2755             }
2756             break;
2757 
2758          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2759             if (desc->buffer_view) {
2760                surface_state = desc->buffer_view->surface_state;
2761                assert(surface_state.alloc_size);
2762                if (need_client_mem_relocs) {
2763                   add_surface_reloc(cmd_buffer, surface_state,
2764                                     desc->buffer_view->address);
2765                }
2766             } else {
2767                surface_state = cmd_buffer->device->null_surface_state;
2768             }
2769             break;
2770 
2771          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2772          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2773             if (desc->buffer) {
2774                /* Compute the offset within the buffer */
2775                uint32_t dynamic_offset =
2776                   push->dynamic_offsets[binding->dynamic_offset_index];
2777                uint64_t offset = desc->offset + dynamic_offset;
2778                /* Clamp to the buffer size */
2779                offset = MIN2(offset, desc->buffer->size);
2780                /* Clamp the range to the buffer size */
2781                uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
2782 
2783                /* Align the range for consistency */
2784                if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2785                   range = align_u32(range, ANV_UBO_ALIGNMENT);
2786 
2787                struct anv_address address =
2788                   anv_address_add(desc->buffer->address, offset);
2789 
2790                surface_state =
2791                   anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2792                enum isl_format format =
2793                   anv_isl_format_for_descriptor_type(cmd_buffer->device,
2794                                                      desc->type);
2795 
2796                isl_surf_usage_flags_t usage =
2797                   desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2798                   ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2799                   ISL_SURF_USAGE_STORAGE_BIT;
2800 
2801                anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2802                                              format, usage, address, range, 1);
2803                if (need_client_mem_relocs)
2804                   add_surface_reloc(cmd_buffer, surface_state, address);
2805             } else {
2806                surface_state = cmd_buffer->device->null_surface_state;
2807             }
2808             break;
2809          }
2810 
2811          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2812             if (desc->buffer_view) {
2813                surface_state = binding->lowered_storage_surface
2814                   ? desc->buffer_view->lowered_storage_surface_state
2815                   : desc->buffer_view->storage_surface_state;
2816                assert(surface_state.alloc_size);
2817                if (need_client_mem_relocs) {
2818                   add_surface_reloc(cmd_buffer, surface_state,
2819                                     desc->buffer_view->address);
2820                }
2821             } else {
2822                surface_state = cmd_buffer->device->null_surface_state;
2823             }
2824             break;
2825 
2826          default:
2827             assert(!"Invalid descriptor type");
2828             continue;
2829          }
2830          assert(surface_state.map);
2831          bt_map[s] = surface_state.offset + state_offset;
2832          break;
2833       }
2834       }
2835    }
2836 
2837    return VK_SUCCESS;
2838 }
2839 
2840 static VkResult
2841 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2842               struct anv_cmd_pipeline_state *pipe_state,
2843               struct anv_shader_bin *shader,
2844               struct anv_state *state)
2845 {
2846    struct anv_pipeline_bind_map *map = &shader->bind_map;
2847    if (map->sampler_count == 0) {
2848       *state = (struct anv_state) { 0, };
2849       return VK_SUCCESS;
2850    }
2851 
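   /* Each SAMPLER_STATE entry is 4 dwords (16 bytes), and the pointer
    * programmed via 3DSTATE_SAMPLER_STATE_POINTERS is expressed in 32-byte
    * units, hence the 32-byte-aligned allocation below.
    */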
2852    uint32_t size = map->sampler_count * 16;
2853    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2854 
2855    if (state->map == NULL)
2856       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2857 
2858    for (uint32_t s = 0; s < map->sampler_count; s++) {
2859       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2860       const struct anv_descriptor *desc =
2861          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2862 
2863       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2864           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2865          continue;
2866 
2867       struct anv_sampler *sampler = desc->sampler;
2868 
2869       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2870        * happens to be zero.
2871        */
2872       if (sampler == NULL)
2873          continue;
2874 
2875       memcpy(state->map + (s * 16),
2876              sampler->state[binding->plane], sizeof(sampler->state[0]));
2877    }
2878 
2879    return VK_SUCCESS;
2880 }
2881 
2882 static uint32_t
2883 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2884                       struct anv_cmd_pipeline_state *pipe_state,
2885                       const VkShaderStageFlags dirty,
2886                       struct anv_shader_bin **shaders,
2887                       uint32_t num_shaders)
2888 {
2889    VkShaderStageFlags flushed = 0;
2890 
2891    VkResult result = VK_SUCCESS;
2892    for (uint32_t i = 0; i < num_shaders; i++) {
2893       if (!shaders[i])
2894          continue;
2895 
2896       gl_shader_stage stage = shaders[i]->stage;
2897       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2898       if ((vk_stage & dirty) == 0)
2899          continue;
2900 
2901       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2902       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2903                              &cmd_buffer->state.samplers[stage]);
2904       if (result != VK_SUCCESS)
2905          break;
2906 
2907       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2908       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2909                                   &cmd_buffer->state.binding_tables[stage]);
2910       if (result != VK_SUCCESS)
2911          break;
2912 
2913       flushed |= vk_stage;
2914    }
2915 
2916    if (result != VK_SUCCESS) {
2917       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2918 
2919       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2920       if (result != VK_SUCCESS)
2921          return 0;
2922 
2923       /* Re-emit state base addresses so we get the new surface state base
2924        * address before we start emitting binding tables etc.
2925        */
2926       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2927 
2928       /* Re-emit all active binding tables */
2929       flushed = 0;
2930 
2931       for (uint32_t i = 0; i < num_shaders; i++) {
2932          if (!shaders[i])
2933             continue;
2934 
2935          gl_shader_stage stage = shaders[i]->stage;
2936 
2937          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2938                                 &cmd_buffer->state.samplers[stage]);
2939          if (result != VK_SUCCESS) {
2940             anv_batch_set_error(&cmd_buffer->batch, result);
2941             return 0;
2942          }
2943          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2944                                      &cmd_buffer->state.binding_tables[stage]);
2945          if (result != VK_SUCCESS) {
2946             anv_batch_set_error(&cmd_buffer->batch, result);
2947             return 0;
2948          }
2949 
2950          flushed |= mesa_to_vk_shader_stage(stage);
2951       }
2952    }
2953 
2954    return flushed;
2955 }
2956 
2957 static void
2958 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2959                                     uint32_t stages)
2960 {
2961    static const uint32_t sampler_state_opcodes[] = {
2962       [MESA_SHADER_VERTEX]                      = 43,
2963       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
2964       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
2965       [MESA_SHADER_GEOMETRY]                    = 46,
2966       [MESA_SHADER_FRAGMENT]                    = 47,
2967    };
2968 
2969    static const uint32_t binding_table_opcodes[] = {
2970       [MESA_SHADER_VERTEX]                      = 38,
2971       [MESA_SHADER_TESS_CTRL]                   = 39,
2972       [MESA_SHADER_TESS_EVAL]                   = 40,
2973       [MESA_SHADER_GEOMETRY]                    = 41,
2974       [MESA_SHADER_FRAGMENT]                    = 42,
2975    };
2976 
2977    anv_foreach_stage(s, stages) {
2978       assert(s < ARRAY_SIZE(binding_table_opcodes));
2979 
2980       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2981          anv_batch_emit(&cmd_buffer->batch,
2982                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2983             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2984             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2985          }
2986       }
2987 
2988       /* Always emit binding table pointers if we're asked to, since on SKL
2989        * this is what flushes push constants. */
2990       anv_batch_emit(&cmd_buffer->batch,
2991                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
2992          btp._3DCommandSubOpcode = binding_table_opcodes[s];
2993          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
2994       }
2995    }
2996 }
2997 
2998 static struct anv_address
2999 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
3000                        const struct anv_shader_bin *shader,
3001                        const struct anv_push_range *range)
3002 {
3003    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3004    switch (range->set) {
3005    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3006       /* This is a descriptor set buffer so the set index is
3007        * actually given by binding->binding.  (Yes, that's
3008        * confusing.)
3009        */
3010       struct anv_descriptor_set *set =
3011          gfx_state->base.descriptors[range->index];
3012       return anv_descriptor_set_address(set);
3013    }
3014 
3015    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
3016       if (gfx_state->base.push_constants_state.alloc_size == 0) {
3017          gfx_state->base.push_constants_state =
3018             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
3019       }
3020       return (struct anv_address) {
3021          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3022          .offset = gfx_state->base.push_constants_state.offset,
3023       };
3024    }
3025 
3026    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3027       return (struct anv_address) {
3028          .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
3029          .offset = shader->kernel.offset +
3030                    shader->prog_data->const_data_offset,
3031       };
3032 
3033    default: {
3034       assert(range->set < MAX_SETS);
3035       struct anv_descriptor_set *set =
3036          gfx_state->base.descriptors[range->set];
3037       const struct anv_descriptor *desc =
3038          &set->descriptors[range->index];
3039 
3040       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3041          if (desc->buffer_view)
3042             return desc->buffer_view->address;
3043       } else {
3044          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3045          if (desc->buffer) {
3046             const struct anv_push_constants *push =
3047                &gfx_state->base.push_constants;
3048             uint32_t dynamic_offset =
3049                push->dynamic_offsets[range->dynamic_offset_index];
3050             return anv_address_add(desc->buffer->address,
3051                                    desc->offset + dynamic_offset);
3052          }
3053       }
3054 
3055       /* For NULL UBOs, we just return an address in the workaround BO.  We do
3056        * writes to it for workarounds but always at the bottom.  The higher
3057        * bytes should be all zeros.
3058        */
3059       assert(range->length * 32 <= 2048);
3060       return (struct anv_address) {
3061          .bo = cmd_buffer->device->workaround_bo,
3062          .offset = 1024,
3063       };
3064    }
3065    }
3066 }
3067 
3068 
3069 /** Returns the size in bytes of the bound buffer
3070  *
3071  * The range is relative to the start of the buffer, not the start of the
3072  * range.  The returned size may be smaller than
3073  *
3074  *    (range->start + range->length) * 32
3075  */
3076 static uint32_t
3077 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
3078                           const struct anv_shader_bin *shader,
3079                           const struct anv_push_range *range)
3080 {
3081    assert(shader->stage != MESA_SHADER_COMPUTE);
3082    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3083    switch (range->set) {
3084    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3085       struct anv_descriptor_set *set =
3086          gfx_state->base.descriptors[range->index];
3087       assert(range->start * 32 < set->desc_mem.alloc_size);
3088       assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
3089       return set->desc_mem.alloc_size;
3090    }
3091 
3092    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
3093       return (range->start + range->length) * 32;
3094 
3095    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3096       return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
3097 
3098    default: {
3099       assert(range->set < MAX_SETS);
3100       struct anv_descriptor_set *set =
3101          gfx_state->base.descriptors[range->set];
3102       const struct anv_descriptor *desc =
3103          &set->descriptors[range->index];
3104 
3105       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3106          /* Here we promote a UBO to a binding table entry so that we can avoid a
3107           * layer of indirection.  We use the descriptor set's internally allocated
3108           * surface state to fill the binding table entry. */
3109          if (!desc->set_buffer_view)
3110             return 0;
3111 
3112          if (range->start * 32 > desc->set_buffer_view->range)
3113             return 0;
3114 
3115          return desc->set_buffer_view->range;
3116       } else {
3117          if (!desc->buffer)
3118             return 0;
3119 
3120          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3121          /* Compute the offset within the buffer */
3122          const struct anv_push_constants *push =
3123             &gfx_state->base.push_constants;
3124          uint32_t dynamic_offset =
3125             push->dynamic_offsets[range->dynamic_offset_index];
3126          uint64_t offset = desc->offset + dynamic_offset;
3127          /* Clamp to the buffer size */
3128          offset = MIN2(offset, desc->buffer->size);
3129          /* Clamp the range to the buffer size */
3130          uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);
3131 
3132          /* Align the range for consistency */
3133          bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
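         /* Worked example with illustrative numbers: a 512-byte buffer with
          * desc->offset = 256 and a dynamic offset of 192 gives offset = 448,
          * so the bound range is clamped to 512 - 448 = 64 bytes and is then
          * left unchanged by the 64-byte ANV_UBO_ALIGNMENT (assumed here).
          */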
3134 
3135          return bound_range;
3136       }
3137    }
3138    }
3139 }
3140 
3141 static void
3142 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
3143                               gl_shader_stage stage,
3144                               struct anv_address *buffers,
3145                               unsigned buffer_count)
3146 {
3147    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3148    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3149 
3150    static const uint32_t push_constant_opcodes[] = {
3151       [MESA_SHADER_VERTEX]                      = 21,
3152       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3153       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3154       [MESA_SHADER_GEOMETRY]                    = 22,
3155       [MESA_SHADER_FRAGMENT]                    = 23,
3156    };
3157 
3158    assert(stage < ARRAY_SIZE(push_constant_opcodes));
3159 
3160    UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
3161 
3162    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
3163       c._3DCommandSubOpcode = push_constant_opcodes[stage];
3164 
3165       /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
3166        *
3167        *    "Constant Buffer Object Control State must be always
3168        *     programmed to zero."
3169        *
3170        * This restriction does not exist on any newer platforms.
3171        *
3172        * We only have one MOCS field for the whole packet, not one per
3173        * buffer.  We could go out of our way here to walk over all of
3174        * the buffers and see if any of them are used externally and use
3175        * the external MOCS.  However, the notion that someone would use
3176        * the same bit of memory for both scanout and a UBO is nuts.
3177        *
3178        * Let's not bother and assume it's all internal.
3179        */
3180 #if GFX_VER >= 9
3181       c.MOCS = mocs;
3182 #elif GFX_VER < 8
3183       c.ConstantBody.MOCS = mocs;
3184 #endif
3185 
3186       if (anv_pipeline_has_stage(pipeline, stage)) {
3187          const struct anv_pipeline_bind_map *bind_map =
3188             &pipeline->shaders[stage]->bind_map;
3189 
3190 #if GFX_VERx10 >= 75
3191          /* The Skylake PRM contains the following restriction:
3192           *
3193           *    "The driver must ensure The following case does not occur
3194           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3195           *     buffer 3 read length equal to zero committed followed by a
3196           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3197           *     zero committed."
3198           *
3199           * To avoid this, we program the buffers in the highest slots.
3200           * This way, slot 0 is only used if slot 3 is also used.
3201           */
3202          assert(buffer_count <= 4);
3203          const unsigned shift = 4 - buffer_count;
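         /* For example, with buffer_count == 1 the single range lands in
          * ReadLength[3]/Buffer[3]; slot 0 is only ever written when all
          * four slots are in use.
          */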
3204          for (unsigned i = 0; i < buffer_count; i++) {
3205             const struct anv_push_range *range = &bind_map->push_ranges[i];
3206 
3207             /* At this point we only have non-empty ranges */
3208             assert(range->length > 0);
3209 
3210             /* For Ivy Bridge, make sure we only set the first range (actual
3211              * push constants)
3212              */
3213             assert((GFX_VERx10 >= 75) || i == 0);
3214 
3215             c.ConstantBody.ReadLength[i + shift] = range->length;
3216             c.ConstantBody.Buffer[i + shift] =
3217                anv_address_add(buffers[i], range->start * 32);
3218          }
3219 #else
3220          /* For Ivy Bridge, push constants are relative to dynamic state
3221           * base address and we only ever push actual push constants.
3222           */
3223          if (bind_map->push_ranges[0].length > 0) {
3224             assert(buffer_count == 1);
3225             assert(bind_map->push_ranges[0].set ==
3226                    ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
3227             assert(buffers[0].bo ==
3228                    cmd_buffer->device->dynamic_state_pool.block_pool.bo);
3229             c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
3230             c.ConstantBody.Buffer[0].bo = NULL;
3231             c.ConstantBody.Buffer[0].offset = buffers[0].offset;
3232          }
3233          assert(bind_map->push_ranges[1].length == 0);
3234          assert(bind_map->push_ranges[2].length == 0);
3235          assert(bind_map->push_ranges[3].length == 0);
3236 #endif
3237       }
3238    }
3239 }
3240 
3241 #if GFX_VER >= 12
3242 static void
3243 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
3244                                   uint32_t shader_mask,
3245                                   struct anv_address *buffers,
3246                                   uint32_t buffer_count)
3247 {
3248    if (buffer_count == 0) {
3249       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
3250          c.ShaderUpdateEnable = shader_mask;
3251          c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
3252       }
3253       return;
3254    }
3255 
3256    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3257    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3258 
3259    static const UNUSED uint32_t push_constant_opcodes[] = {
3260       [MESA_SHADER_VERTEX]                      = 21,
3261       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3262       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3263       [MESA_SHADER_GEOMETRY]                    = 22,
3264       [MESA_SHADER_FRAGMENT]                    = 23,
3265    };
3266 
3267    gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
3268    assert(stage < ARRAY_SIZE(push_constant_opcodes));
3269 
3270    const struct anv_pipeline_bind_map *bind_map =
3271       &pipeline->shaders[stage]->bind_map;
3272 
3273    uint32_t *dw;
3274    const uint32_t buffer_mask = (1 << buffer_count) - 1;
3275    const uint32_t num_dwords = 2 + 2 * buffer_count;
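   /* The 3DSTATE_CONSTANT_ALL packet here is 2 header dwords plus one
    * 2-dword (pointer + read length) entry per buffer, packed below at
    * dw[2 + 2 * i].
    */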
3276 
3277    dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3278                         GENX(3DSTATE_CONSTANT_ALL),
3279                         .ShaderUpdateEnable = shader_mask,
3280                         .PointerBufferMask = buffer_mask,
3281                         .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
3282 
3283    for (int i = 0; i < buffer_count; i++) {
3284       const struct anv_push_range *range = &bind_map->push_ranges[i];
3285       GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
3286          &cmd_buffer->batch, dw + 2 + i * 2,
3287          &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
3288             .PointerToConstantBuffer =
3289                anv_address_add(buffers[i], range->start * 32),
3290             .ConstantBufferReadLength = range->length,
3291          });
3292    }
3293 }
3294 #endif
3295 
3296 static void
3297 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
3298                                 VkShaderStageFlags dirty_stages)
3299 {
3300    VkShaderStageFlags flushed = 0;
3301    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3302    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3303 
3304 #if GFX_VER >= 12
3305    uint32_t nobuffer_stages = 0;
3306 #endif
3307 
3308    /* Compute robust pushed register access mask for each stage. */
3309    if (cmd_buffer->device->robust_buffer_access) {
3310       anv_foreach_stage(stage, dirty_stages) {
3311          if (!anv_pipeline_has_stage(pipeline, stage))
3312             continue;
3313 
3314          const struct anv_shader_bin *shader = pipeline->shaders[stage];
3315          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3316          struct anv_push_constants *push = &gfx_state->base.push_constants;
3317 
3318          push->push_reg_mask[stage] = 0;
3319          /* Start of the current range in the shader, relative to the start of
3320           * push constants in the shader.
3321           */
3322          unsigned range_start_reg = 0;
3323          for (unsigned i = 0; i < 4; i++) {
3324             const struct anv_push_range *range = &bind_map->push_ranges[i];
3325             if (range->length == 0)
3326                continue;
3327 
3328             unsigned bound_size =
3329                get_push_range_bound_size(cmd_buffer, shader, range);
3330             if (bound_size >= range->start * 32) {
3331                unsigned bound_regs =
3332                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
3333                        range->length);
3334                assert(range_start_reg + bound_regs <= 64);
3335                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
3336                                                               bound_regs);
3337             }
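            /* Illustrative numbers: a range with start = 2 and length = 8
             * backed by a 192-byte bound buffer gives
             * bound_regs = MIN2(DIV_ROUND_UP(192, 32) - 2, 8) = 4, so only
             * the first 4 of the 8 pushed registers are marked readable.
             */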
3338 
3339             cmd_buffer->state.push_constants_dirty |=
3340                mesa_to_vk_shader_stage(stage);
3341 
3342             range_start_reg += range->length;
3343          }
3344       }
3345    }
3346 
3347    /* Resets the push constant state so that we allocate a new one if
3348     * needed.
3349     */
3350    gfx_state->base.push_constants_state = ANV_STATE_NULL;
3351 
3352    anv_foreach_stage(stage, dirty_stages) {
3353       unsigned buffer_count = 0;
3354       flushed |= mesa_to_vk_shader_stage(stage);
3355       UNUSED uint32_t max_push_range = 0;
3356 
3357       struct anv_address buffers[4] = {};
3358       if (anv_pipeline_has_stage(pipeline, stage)) {
3359          const struct anv_shader_bin *shader = pipeline->shaders[stage];
3360          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3361 
3362          /* We have to gather buffer addresses as a second step because the
3363           * loop above puts data into the push constant area and the call to
3364           * get_push_range_address is what locks our push constants and copies
3365           * them into the actual GPU buffer.  If we did the two loops at the
3366           * same time, we'd risk only having some of the sizes in the push
3367           * constant buffer when we did the copy.
3368           */
3369          for (unsigned i = 0; i < 4; i++) {
3370             const struct anv_push_range *range = &bind_map->push_ranges[i];
3371             if (range->length == 0)
3372                break;
3373 
3374             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
3375             max_push_range = MAX2(max_push_range, range->length);
3376             buffer_count++;
3377          }
3378 
3379          /* We have at most 4 buffers but they should be tightly packed */
3380          for (unsigned i = buffer_count; i < 4; i++)
3381             assert(bind_map->push_ranges[i].length == 0);
3382       }
3383 
3384 #if GFX_VER >= 12
3385       /* If this stage doesn't have any push constants, emit it later in a
3386        * single CONSTANT_ALL packet.
3387        */
3388       if (buffer_count == 0) {
3389          nobuffer_stages |= 1 << stage;
3390          continue;
3391       }
3392 
3393       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
3394        * contains only 5 bits, so we can only use it for buffers smaller than
3395        * 32.
3396        */
3397       if (max_push_range < 32) {
3398          cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
3399                                            buffers, buffer_count);
3400          continue;
3401       }
3402 #endif
3403 
3404       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
3405    }
3406 
3407 #if GFX_VER >= 12
3408    if (nobuffer_stages)
3409       cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
3410 #endif
3411 
3412    cmd_buffer->state.push_constants_dirty &= ~flushed;
3413 }
3414 
3415 #if GFX_VERx10 >= 125
3416 static void
3417 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
3418                                   VkShaderStageFlags dirty_stages)
3419 {
3420    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3421    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3422 
3423    if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_NV &&
3424        anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
3425 
3426       const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_TASK];
3427       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3428 
3429       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
3430          const struct anv_push_range *range = &bind_map->push_ranges[0];
3431          if (range->length > 0) {
3432             struct anv_address buffer =
3433                get_push_range_address(cmd_buffer, shader, range);
3434 
3435             uint64_t addr = anv_address_physical(buffer);
3436             data.InlineData[0] = addr & 0xffffffff;
3437             data.InlineData[1] = addr >> 32;
3438 
3439             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3440                    cmd_buffer->state.gfx.base.push_constants.client_data,
3441                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3442          }
3443       }
3444    }
3445 
3446    if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_NV &&
3447        anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
3448 
3449       const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_MESH];
3450       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3451 
3452       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
3453          const struct anv_push_range *range = &bind_map->push_ranges[0];
3454          if (range->length > 0) {
3455             struct anv_address buffer =
3456                get_push_range_address(cmd_buffer, shader, range);
3457 
3458             uint64_t addr = anv_address_physical(buffer);
3459             data.InlineData[0] = addr & 0xffffffff;
3460             data.InlineData[1] = addr >> 32;
3461 
3462             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3463                    cmd_buffer->state.gfx.base.push_constants.client_data,
3464                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3465          }
3466       }
3467    }
3468 
3469    cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
3470 }
3471 #endif
3472 
3473 static void
3474 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
3475 {
3476    const uint32_t clip_states =
3477 #if GFX_VER <= 7
3478       ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3479       ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
3480 #endif
3481       ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
3482       ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
3483       ANV_CMD_DIRTY_PIPELINE;
3484 
3485    if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)
3486       return;
3487 
3488    /* Take dynamic primitive topology in to account with
3489     *    3DSTATE_CLIP::ViewportXYClipTestEnable
3490     */
3491    bool xy_clip_test_enable = false;
3492 
3493    if (cmd_buffer->state.gfx.pipeline->dynamic_states &
3494        ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
3495       VkPrimitiveTopology primitive_topology =
3496          cmd_buffer->state.gfx.dynamic.primitive_topology;
3497 
3498       VkPolygonMode dynamic_raster_mode =
3499          genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
3500                                    primitive_topology);
3501 
3502       xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
3503    }
3504 
3505 #if GFX_VER <= 7
3506    const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
3507 #endif
3508    struct GENX(3DSTATE_CLIP) clip = {
3509       GENX(3DSTATE_CLIP_header),
3510 #if GFX_VER <= 7
3511       .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
3512       .CullMode     = genX(vk_to_intel_cullmode)[d->cull_mode],
3513 #endif
3514       .ViewportXYClipTestEnable = xy_clip_test_enable,
3515    };
3516    uint32_t dwords[GENX(3DSTATE_CLIP_length)];
3517 
3518    /* TODO(mesh): Multiview. */
3519    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3520    if (anv_pipeline_is_primitive(pipeline)) {
3521       const struct brw_vue_prog_data *last =
3522          anv_pipeline_get_last_vue_prog_data(pipeline);
3523       if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3524          clip.MaximumVPIndex =
3525             cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?
3526             cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;
3527       }
3528    } else if (anv_pipeline_is_mesh(pipeline)) {
3529       const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
3530       if (mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
3531          uint32_t viewport_count = cmd_buffer->state.gfx.dynamic.viewport.count;
3532          clip.MaximumVPIndex = viewport_count > 0 ? viewport_count - 1 : 0;
3533       }
3534    }
3535 
3536    GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
3537    anv_batch_emit_merge(&cmd_buffer->batch, dwords,
3538                         pipeline->gfx7.clip);
3539 }
3540 
3541 static void
3542 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
3543 {
3544    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3545    uint32_t count = gfx->dynamic.viewport.count;
3546    const VkViewport *viewports = gfx->dynamic.viewport.viewports;
3547    struct anv_state sf_clip_state =
3548       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
3549 
3550    bool negative_one_to_one =
3551       cmd_buffer->state.gfx.pipeline->negative_one_to_one;
3552 
3553    float scale = negative_one_to_one ? 0.5f : 1.0f;
3554 
3555    for (uint32_t i = 0; i < count; i++) {
3556       const VkViewport *vp = &viewports[i];
3557 
3558       /* The gfx7 state struct has just the matrix and guardband fields, the
3559        * gfx8 struct adds the min/max viewport fields. */
3560       struct GENX(SF_CLIP_VIEWPORT) sfv = {
3561          .ViewportMatrixElementm00 = vp->width / 2,
3562          .ViewportMatrixElementm11 = vp->height / 2,
3563          .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
3564          .ViewportMatrixElementm30 = vp->x + vp->width / 2,
3565          .ViewportMatrixElementm31 = vp->y + vp->height / 2,
3566          .ViewportMatrixElementm32 = negative_one_to_one ?
3567             (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
3568          .XMinClipGuardband = -1.0f,
3569          .XMaxClipGuardband = 1.0f,
3570          .YMinClipGuardband = -1.0f,
3571          .YMaxClipGuardband = 1.0f,
3572 #if GFX_VER >= 8
3573          .XMinViewPort = vp->x,
3574          .XMaxViewPort = vp->x + vp->width - 1,
3575          .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
3576          .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
3577 #endif
3578       };
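      /* The matrix above maps NDC to window coordinates: for X,
       * x_window = m00 * x_ndc + m30, so NDC -1 lands on vp->x and NDC +1 on
       * vp->x + vp->width.  Y and Z follow the same pattern, with Z scaled
       * by 0.5 when the pipeline uses a [-1,1] clip-space depth range.
       */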
3579 
3580       const uint32_t fb_size_max = 1 << 14;
3581       uint32_t x_min = 0, x_max = fb_size_max;
3582       uint32_t y_min = 0, y_max = fb_size_max;
3583 
3584       /* If we have a valid renderArea, include that */
3585       if (gfx->render_area.extent.width > 0 &&
3586           gfx->render_area.extent.height > 0) {
3587          x_min = MAX2(x_min, gfx->render_area.offset.x);
3588          x_max = MIN2(x_max, gfx->render_area.offset.x +
3589                              gfx->render_area.extent.width);
3590          y_min = MAX2(y_min, gfx->render_area.offset.y);
3591          y_max = MIN2(y_max, gfx->render_area.offset.y +
3592                              gfx->render_area.extent.height);
3593       }
3594 
3595       /* The client is required to have enough scissors for whatever it sets
3596        * as ViewportIndex but it's possible that they've got more viewports
3597        * set from a previous command.  Also, from the Vulkan 1.3.207:
3598        *
3599        *    "The application must ensure (using scissor if necessary) that
3600        *    all rendering is contained within the render area."
3601        *
3602        * If the client doesn't set a scissor, that basically means it
3603        * guarantees everything is in-bounds already.  If we end up using a
3604        * guardband of [-1, 1] in that case, there shouldn't be much loss.
3605        * It's theoretically possible that they could do all their clipping
3606        * with clip planes but that'd be a bit odd.
3607        */
3608       if (i < gfx->dynamic.scissor.count) {
3609          const VkRect2D *scissor = &gfx->dynamic.scissor.scissors[i];
3610          x_min = MAX2(x_min, scissor->offset.x);
3611          x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
3612          y_min = MAX2(y_min, scissor->offset.y);
3613          y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
3614       }
3615 
3616       /* Only bother calculating the guardband if our known render area is
3617        * less than the maximum size.  Otherwise, it will calculate [-1, 1]
3618        * anyway but possibly with precision loss.
3619        */
3620       if (x_min > 0 || x_max < fb_size_max ||
3621           y_min > 0 || y_max < fb_size_max) {
3622          intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
3623                                         sfv.ViewportMatrixElementm00,
3624                                         sfv.ViewportMatrixElementm11,
3625                                         sfv.ViewportMatrixElementm30,
3626                                         sfv.ViewportMatrixElementm31,
3627                                         &sfv.XMinClipGuardband,
3628                                         &sfv.XMaxClipGuardband,
3629                                         &sfv.YMinClipGuardband,
3630                                         &sfv.YMaxClipGuardband);
3631       }
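      /* Roughly: the guardband is the rasterizer's maximum screen-space
       * extent mapped back into NDC through the viewport transform.
       * Primitives that lie entirely inside it can be trivially accepted by
       * the clipper and handled by the rasterizer/scissor instead of full 3D
       * clipping; anything crossing it must go through the clipper.  A
       * guardband of [-1, 1] (just the viewport) is always a correct, if
       * slower, fallback.
       */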
3632 
3633       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3634    }
3635 
3636    anv_batch_emit(&cmd_buffer->batch,
3637                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3638       clip.SFClipViewportPointer = sf_clip_state.offset;
3639    }
3640 }
3641 
3642 static void
3643 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3644                                bool depth_clamp_enable)
3645 {
3646    uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
3647    const VkViewport *viewports =
3648       cmd_buffer->state.gfx.dynamic.viewport.viewports;
3649    struct anv_state cc_state =
3650       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3651 
3652    for (uint32_t i = 0; i < count; i++) {
3653       const VkViewport *vp = &viewports[i];
3654 
3655       /* From the Vulkan spec:
3656        *
3657        *    "It is valid for minDepth to be greater than or equal to
3658        *    maxDepth."
3659        */
3660       float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3661       float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3662 
3663       struct GENX(CC_VIEWPORT) cc_viewport = {
3664          .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3665          .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3666       };
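      /* Roughly: CC_VIEWPORT gives the range the final depth value is
       * clamped to.  With depth clamp enabled that range is the viewport
       * depth range; otherwise we program [0, 1] and rely on near/far
       * clipping to discard out-of-range geometry.
       */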
3667 
3668       GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3669    }
3670 
3671    anv_batch_emit(&cmd_buffer->batch,
3672                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3673       cc.CCViewportPointer = cc_state.offset;
3674    }
3675 }
3676 
3677 static int64_t
3678 clamp_int64(int64_t x, int64_t min, int64_t max)
3679 {
3680    if (x < min)
3681       return min;
3682    else if (x < max)
3683       return x;
3684    else
3685       return max;
3686 }
3687 
3688 static void
3689 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3690 {
3691    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3692    uint32_t count = gfx->dynamic.scissor.count;
3693    const VkRect2D *scissors = gfx->dynamic.scissor.scissors;
3694    const VkViewport *viewports = gfx->dynamic.viewport.viewports;
3695 
3696    /* Wa_1409725701:
3697     *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3698     *    stored as an array of up to 16 elements. The location of first
3699     *    element of the array, as specified by Pointer to SCISSOR_RECT, should
3700     *    be aligned to a 64-byte boundary.
3701     */
3702    uint32_t alignment = 64;
3703    struct anv_state scissor_state =
3704       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3705 
3706    for (uint32_t i = 0; i < count; i++) {
3707       const VkRect2D *s = &scissors[i];
3708       const VkViewport *vp = &viewports[i];
3709 
3710       /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3711        * ymax < ymin for empty clips.  In case clip x, y, width height are all
3712        * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3713        * what we want. Just special case empty clips and produce a canonical
3714        * empty clip. */
3715       static const struct GENX(SCISSOR_RECT) empty_scissor = {
3716          .ScissorRectangleYMin = 1,
3717          .ScissorRectangleXMin = 1,
3718          .ScissorRectangleYMax = 0,
3719          .ScissorRectangleXMax = 0
3720       };
3721 
3722       const int max = 0xffff;
3723 
3724       uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3725       uint32_t x_min = MAX2(s->offset.x, vp->x);
3726       uint32_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3727                        MAX2(vp->y, vp->y + vp->height) - 1);
3728       uint32_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3729                        vp->x + vp->width - 1);
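      /* SCISSOR_RECT max coordinates are inclusive, hence the "- 1" above.
       * For example (hypothetical numbers), a scissor at offset (0, 0) with
       * a 16x16 extent programs XMin/YMin = 0 and XMax/YMax = 15.
       */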
3730 
3731       /* Do this math using int64_t so overflow gets clamped correctly. */
3732       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3733          y_min = clamp_int64((uint64_t) y_min, gfx->render_area.offset.y, max);
3734          x_min = clamp_int64((uint64_t) x_min, gfx->render_area.offset.x, max);
3735          y_max = clamp_int64((uint64_t) y_max, 0,
3736                              gfx->render_area.offset.y +
3737                              gfx->render_area.extent.height - 1);
3738          x_max = clamp_int64((uint64_t) x_max, 0,
3739                              gfx->render_area.offset.x +
3740                              gfx->render_area.extent.width - 1);
3741       }
3742 
3743       struct GENX(SCISSOR_RECT) scissor = {
3744          .ScissorRectangleYMin = y_min,
3745          .ScissorRectangleXMin = x_min,
3746          .ScissorRectangleYMax = y_max,
3747          .ScissorRectangleXMax = x_max
3748       };
3749 
3750       if (s->extent.width <= 0 || s->extent.height <= 0) {
3751          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3752                                  &empty_scissor);
3753       } else {
3754          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3755       }
3756    }
3757 
3758    anv_batch_emit(&cmd_buffer->batch,
3759                   GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3760       ssp.ScissorRectPointer = scissor_state.offset;
3761    }
3762 }
3763 
3764 static void
3765 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3766 {
3767    const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
3768    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3769 
3770 #if GFX_VER == 7
3771 #  define streamout_state_dw pipeline->gfx7.streamout_state
3772 #else
3773 #  define streamout_state_dw pipeline->gfx8.streamout_state
3774 #endif
3775 
3776    uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3777 
3778    struct GENX(3DSTATE_STREAMOUT) so = {
3779       GENX(3DSTATE_STREAMOUT_header),
3780       .RenderingDisable = d->raster_discard,
3781    };
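   /* anv_batch_emit_merge() below ORs these freshly packed DWords with the
    * 3DSTATE_STREAMOUT DWords pre-packed into the pipeline, so only
    * RenderingDisable is sourced from dynamic state here.
    */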
3782    GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3783    anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3784 }
3785 
3786 void
3787 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
3788 {
3789    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3790    uint32_t *p;
3791 
3792    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3793 
3794    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3795 
3796    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
3797 
3798    genX(flush_pipeline_select_3d)(cmd_buffer);
3799 
3800    /* Apply any pending pipeline flushes we may have.  We want to apply them
3801     * now because, if any of those flushes are for things like push constants,
3802     * the GPU will read the state at weird times.
3803     */
3804    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3805 
3806    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3807    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3808       vb_emit |= pipeline->vb_used;
3809 
3810    if (vb_emit) {
3811       const uint32_t num_buffers = __builtin_popcount(vb_emit);
3812       const uint32_t num_dwords = 1 + num_buffers * 4;
3813 
3814       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3815                           GENX(3DSTATE_VERTEX_BUFFERS));
3816       uint32_t i = 0;
3817       u_foreach_bit(vb, vb_emit) {
3818          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3819          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3820 
3821          /* If dynamic, use stride/size from vertex binding, otherwise use
3822           * stride/size that was setup in the pipeline object.
3823           */
3824          bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;
3825          bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;
3826 
3827          struct GENX(VERTEX_BUFFER_STATE) state;
3828          if (buffer) {
3829             uint32_t stride = dynamic_stride ?
3830                cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;
3831             /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):
3832              *
3833              * "If pname:pSizes is not NULL then pname:pSizes[i] specifies
3834              * the bound size of the vertex buffer starting from the corresponding
3835              * elements of pname:pBuffers[i] plus pname:pOffsets[i]."
3836              */
3837             UNUSED uint32_t size = dynamic_size ?
3838                cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;
3839 
3840             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3841                .VertexBufferIndex = vb,
3842 
3843                .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3844                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3845 #if GFX_VER <= 7
3846                .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA,
3847                .InstanceDataStepRate = pipeline->vb[vb].instance_divisor,
3848 #endif
3849                .AddressModifyEnable = true,
3850                .BufferPitch = stride,
3851                .BufferStartingAddress = anv_address_add(buffer->address, offset),
3852                .NullVertexBuffer = offset >= buffer->size,
3853 #if GFX_VER >= 12
3854                .L3BypassDisable = true,
3855 #endif
3856 
3857 #if GFX_VER >= 8
3858                .BufferSize = size,
3859 #else
3860                /* XXX: to handle dynamic offset for older gens we might want
3861                 * to modify Endaddress, but there are issues when doing so:
3862                 *
3863                 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3864                 */
3865                .EndAddress = anv_address_add(buffer->address, buffer->size - 1),
3866 #endif
3867             };
3868          } else {
3869             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3870                .VertexBufferIndex = vb,
3871                .NullVertexBuffer = true,
3872                .MOCS = anv_mocs(cmd_buffer->device, NULL,
3873                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3874             };
3875          }
3876 
3877 #if GFX_VER >= 8 && GFX_VER <= 9
3878          genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3879                                                         state.BufferStartingAddress,
3880                                                         state.BufferSize);
3881 #endif
3882 
3883          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3884          i++;
3885       }
3886    }
3887 
3888    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3889 
3890    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3891                                 pipeline->active_stages;
3892    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3893        !cmd_buffer->state.push_constants_dirty)
3894       return;
3895 
3896    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3897        (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3898                          ANV_CMD_DIRTY_PIPELINE))) {
3899       /* Wa_16011411144:
3900        *
3901        * SW must insert a PIPE_CONTROL cmd before and after the
3902        * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3903        * state is not combined with other state changes.
3904        */
3905       if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3906          anv_add_pending_pipe_bits(cmd_buffer,
3907                                    ANV_PIPE_CS_STALL_BIT,
3908                                    "before SO_BUFFER change WA");
3909          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3910       }
3911 
3912       /* We don't need any per-buffer dirty tracking because you're not
3913        * allowed to bind different XFB buffers while XFB is enabled.
3914        */
3915       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3916          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3917          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3918 #if GFX_VER < 12
3919             sob.SOBufferIndex = idx;
3920 #else
3921             sob._3DCommandOpcode = 0;
3922             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
3923 #endif
3924 
3925             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3926                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
3927                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3928                                                         xfb->offset);
3929 #if GFX_VER >= 8
3930                sob.SOBufferEnable = true;
3931                sob.StreamOffsetWriteEnable = false;
3932                /* Size is in DWords - 1 */
3933                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3934 #else
3935                /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3936                 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3937                 * default for an empty SO_BUFFER packet) to disable them.
3938                 */
3939                sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3940                sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3941                                                        xfb->offset + xfb->size);
3942 #endif
3943             } else {
3944                sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3945             }
3946          }
3947       }
3948 
3949       if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3950          /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
3951          anv_add_pending_pipe_bits(cmd_buffer,
3952                                    ANV_PIPE_CS_STALL_BIT,
3953                                    "after SO_BUFFER change WA");
3954          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3955       } else if (GFX_VER >= 10) {
3956          /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
3957          anv_add_pending_pipe_bits(cmd_buffer,
3958                                    ANV_PIPE_CS_STALL_BIT,
3959                                    "after 3DSTATE_SO_BUFFER call");
3960       }
3961    }
3962 
3963    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3964       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3965 
3966       /* If the pipeline changed, we may need to re-allocate push constant
3967        * space in the URB.
3968        */
3969       cmd_buffer_alloc_push_constants(cmd_buffer);
3970    }
3971 
3972    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3973       cmd_buffer->state.gfx.primitive_topology = pipeline->topology;
3974 
3975 #if GFX_VER <= 7
3976    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3977        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3978       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3979        *
3980        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3981        *    stall needs to be sent just prior to any 3DSTATE_VS,
3982        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3983        *    3DSTATE_BINDING_TABLE_POINTER_VS,
3984        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
3985        *    PIPE_CONTROL needs to be sent before any combination of VS
3986        *    associated 3DSTATE."
3987        */
3988       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3989          pc.DepthStallEnable  = true;
3990          pc.PostSyncOperation = WriteImmediateData;
3991          pc.Address           = cmd_buffer->device->workaround_address;
3992          anv_debug_dump_pc(pc);
3993       }
3994    }
3995 #endif
3996 
3997    /* Render targets live in the same binding table as fragment descriptors */
3998    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3999       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
4000 
4001    /* We emit the binding tables and sampler tables first, then emit push
4002     * constants and then finally emit binding table and sampler table
4003     * pointers.  It has to happen in this order, since emitting the binding
4004     * tables may change the push constants (in case of storage images). After
4005     * emitting push constants, on SKL+ we have to emit the corresponding
4006     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
4007     */
4008    uint32_t dirty = 0;
4009    if (descriptors_dirty) {
4010       dirty = flush_descriptor_sets(cmd_buffer,
4011                                     &cmd_buffer->state.gfx.base,
4012                                     descriptors_dirty,
4013                                     pipeline->shaders,
4014                                     ARRAY_SIZE(pipeline->shaders));
4015       cmd_buffer->state.descriptors_dirty &= ~dirty;
4016    }
4017 
4018    if (dirty || cmd_buffer->state.push_constants_dirty) {
4019       /* Because we're pushing UBOs, we have to push whenever either
4020        * descriptors or push constants is dirty.
4021        */
4022       dirty |= cmd_buffer->state.push_constants_dirty;
4023       cmd_buffer_flush_push_constants(cmd_buffer,
4024                                       dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4025 #if GFX_VERx10 >= 125
4026       cmd_buffer_flush_mesh_inline_data(
4027          cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_NV |
4028                               VK_SHADER_STAGE_MESH_BIT_NV));
4029 #endif
4030    }
4031 
4032    if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
4033       cmd_buffer_emit_descriptor_pointers(cmd_buffer,
4034                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4035    }
4036 
4037    cmd_buffer_emit_clip(cmd_buffer);
4038 
4039    if (anv_cmd_buffer_needs_dynamic_state(cmd_buffer,
4040                                           ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE))
4041       cmd_buffer_emit_streamout(cmd_buffer);
4042 
4043    if (anv_cmd_buffer_needs_dynamic_state(cmd_buffer,
4044                                           ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
4045                                           ANV_CMD_DIRTY_RENDER_TARGETS |
4046                                           ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)) {
4047       cmd_buffer_emit_viewport(cmd_buffer);
4048       cmd_buffer_emit_depth_viewport(cmd_buffer,
4049                                      pipeline->depth_clamp_enable);
4050    }
4051 
4052    if (anv_cmd_buffer_needs_dynamic_state(cmd_buffer,
4053                                           ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
4054                                           ANV_CMD_DIRTY_RENDER_TARGETS |
4055                                           ANV_CMD_DIRTY_DYNAMIC_VIEWPORT))
4056       cmd_buffer_emit_scissor(cmd_buffer);
4057 
4058    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
4059 }
4060 
4061 static void
4062 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
4063                struct anv_address addr,
4064                uint32_t size, uint32_t index)
4065 {
4066    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
4067                                  GENX(3DSTATE_VERTEX_BUFFERS));
4068 
4069    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
4070       &(struct GENX(VERTEX_BUFFER_STATE)) {
4071          .VertexBufferIndex = index,
4072          .AddressModifyEnable = true,
4073          .BufferPitch = 0,
4074          .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
4075                           ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
4076          .NullVertexBuffer = size == 0,
4077 #if GFX_VER >= 12
4078          .L3BypassDisable = true,
4079 #endif
4080 #if (GFX_VER >= 8)
4081          .BufferStartingAddress = addr,
4082          .BufferSize = size
4083 #else
4084          .BufferStartingAddress = addr,
4085          .EndAddress = anv_address_add(addr, size),
4086 #endif
4087       });
4088 
4089    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
4090                                                   index, addr, size);
4091 }
4092 
4093 static void
4094 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
4095                              struct anv_address addr)
4096 {
4097    emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
4098 }
4099 
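/* firstVertex/firstInstance reach the vertex shader through a small side-band
 * vertex buffer at ANV_SVGS_VB_INDEX holding the two dwords written below;
 * the vertex fetcher reads them back as system-value inputs (roughly the
 * gl_BaseVertex / gl_BaseInstance pair).  Passing (0, 0) just binds a null
 * vertex buffer, whose reads return zero anyway.
 */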
4100 static void
4101 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
4102                           uint32_t base_vertex, uint32_t base_instance)
4103 {
4104    if (base_vertex == 0 && base_instance == 0) {
4105       emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
4106    } else {
4107       struct anv_state id_state =
4108          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
4109 
4110       ((uint32_t *)id_state.map)[0] = base_vertex;
4111       ((uint32_t *)id_state.map)[1] = base_instance;
4112 
4113       struct anv_address addr = {
4114          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4115          .offset = id_state.offset,
4116       };
4117 
4118       emit_base_vertex_instance_bo(cmd_buffer, addr);
4119    }
4120 }
4121 
4122 static void
4123 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
4124 {
4125    struct anv_state state =
4126       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
4127 
4128    ((uint32_t *)state.map)[0] = draw_index;
4129 
4130    struct anv_address addr = {
4131       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4132       .offset = state.offset,
4133    };
4134 
4135    emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
4136 }
4137 
4138 static void
4139 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
4140                                    uint32_t access_type)
4141 {
4142    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4143    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4144 
4145    uint64_t vb_used = pipeline->vb_used;
4146    if (vs_prog_data->uses_firstvertex ||
4147        vs_prog_data->uses_baseinstance)
4148       vb_used |= 1ull << ANV_SVGS_VB_INDEX;
4149    if (vs_prog_data->uses_drawid)
4150       vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
4151 
4152    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
4153                                                        access_type == RANDOM,
4154                                                        vb_used);
4155 }
4156 
4157 ALWAYS_INLINE static void
4158 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
4159                                            const struct brw_vs_prog_data *vs_prog_data,
4160                                            uint32_t base_vertex,
4161                                            uint32_t base_instance,
4162                                            uint32_t draw_id,
4163                                            bool force_flush)
4164 {
4165    bool emitted = false;
4166    if (vs_prog_data->uses_firstvertex ||
4167        vs_prog_data->uses_baseinstance) {
4168       emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
4169       emitted = true;
4170    }
4171    if (vs_prog_data->uses_drawid) {
4172       emit_draw_index(cmd_buffer, draw_id);
4173       emitted = true;
4174    }
4175    /* Emitting draw index or vertex index BOs may result in needing
4176     * additional VF cache flushes.
4177     */
4178    if (emitted || force_flush)
4179       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4180 }
4181 
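/* Multiview: the number of views is the number of bits set in the rendering
 * view mask, clamped to at least one.  E.g. (hypothetical mask) a view_mask
 * of 0b0101 yields 2 views, while view_mask = 0 (multiview disabled) still
 * counts as a single view.
 */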
4182 static unsigned
4183 anv_cmd_buffer_get_view_count(struct anv_cmd_buffer *cmd_buffer)
4184 {
4185    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4186    return MAX2(1, util_bitcount(gfx->view_mask));
4187 }
4188 
4189 void genX(CmdDraw)(
4190     VkCommandBuffer                             commandBuffer,
4191     uint32_t                                    vertexCount,
4192     uint32_t                                    instanceCount,
4193     uint32_t                                    firstVertex,
4194     uint32_t                                    firstInstance)
4195 {
4196    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4197    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4198    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4199 
4200    if (anv_batch_has_error(&cmd_buffer->batch))
4201       return;
4202 
4203    const uint32_t count = (vertexCount *
4204                            instanceCount *
4205                            (pipeline->use_primitive_replication ?
4206                             1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4207    anv_measure_snapshot(cmd_buffer,
4208                         INTEL_SNAPSHOT_DRAW,
4209                         "draw", count);
4210    trace_intel_begin_draw(&cmd_buffer->trace, cmd_buffer);
4211 
4212    genX(cmd_buffer_flush_state)(cmd_buffer);
4213 
4214    if (cmd_buffer->state.conditional_render_enabled)
4215       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4216 
4217    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4218                                               firstVertex, firstInstance, 0,
4219                                               true);
4220 
4221    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4222     * different views.  We need to multiply instanceCount by the view count.
4223     */
4224    if (!pipeline->use_primitive_replication)
4225       instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4226 
4227    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4228       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4229       prim.VertexAccessType         = SEQUENTIAL;
4230       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4231       prim.VertexCountPerInstance   = vertexCount;
4232       prim.StartVertexLocation      = firstVertex;
4233       prim.InstanceCount            = instanceCount;
4234       prim.StartInstanceLocation    = firstInstance;
4235       prim.BaseVertexLocation       = 0;
4236    }
4237 
4238    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4239 
4240    trace_intel_end_draw(&cmd_buffer->trace, cmd_buffer, count);
4241 }
4242 
4243 void genX(CmdDrawMultiEXT)(
4244     VkCommandBuffer                             commandBuffer,
4245     uint32_t                                    drawCount,
4246     const VkMultiDrawInfoEXT                   *pVertexInfo,
4247     uint32_t                                    instanceCount,
4248     uint32_t                                    firstInstance,
4249     uint32_t                                    stride)
4250 {
4251    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4252    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4253    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4254 
4255    if (anv_batch_has_error(&cmd_buffer->batch))
4256       return;
4257 
4258    const uint32_t count = (drawCount *
4259                            instanceCount *
4260                            (pipeline->use_primitive_replication ?
4261                             1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4262    anv_measure_snapshot(cmd_buffer,
4263                         INTEL_SNAPSHOT_DRAW,
4264                         "draw_multi", count);
4265    trace_intel_begin_draw_multi(&cmd_buffer->trace, cmd_buffer);
4266 
4267    genX(cmd_buffer_flush_state)(cmd_buffer);
4268 
4269    if (cmd_buffer->state.conditional_render_enabled)
4270       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4271 
4272    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4273     * different views.  We need to multiply instanceCount by the view count.
4274     */
4275    if (!pipeline->use_primitive_replication)
4276       instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4277 
4278    uint32_t i = 0;
4279    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
4280       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4281                                                  draw->firstVertex,
4282                                                  firstInstance, i, !i);
4283 
4284       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4285          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4286          prim.VertexAccessType         = SEQUENTIAL;
4287          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4288          prim.VertexCountPerInstance   = draw->vertexCount;
4289          prim.StartVertexLocation      = draw->firstVertex;
4290          prim.InstanceCount            = instanceCount;
4291          prim.StartInstanceLocation    = firstInstance;
4292          prim.BaseVertexLocation       = 0;
4293       }
4294    }
4295 
4296    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4297 
4298    trace_intel_end_draw_multi(&cmd_buffer->trace, cmd_buffer, count);
4299 }
4300 
4301 void genX(CmdDrawIndexed)(
4302     VkCommandBuffer                             commandBuffer,
4303     uint32_t                                    indexCount,
4304     uint32_t                                    instanceCount,
4305     uint32_t                                    firstIndex,
4306     int32_t                                     vertexOffset,
4307     uint32_t                                    firstInstance)
4308 {
4309    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4310    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4311    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4312 
4313    if (anv_batch_has_error(&cmd_buffer->batch))
4314       return;
4315 
4316    const uint32_t count = (indexCount *
4317                            instanceCount *
4318                            (pipeline->use_primitive_replication ?
4319                             1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4320    anv_measure_snapshot(cmd_buffer,
4321                         INTEL_SNAPSHOT_DRAW,
4322                         "draw indexed",
4323                         count);
4324    trace_intel_begin_draw_indexed(&cmd_buffer->trace, cmd_buffer);
4325 
4326    genX(cmd_buffer_flush_state)(cmd_buffer);
4327 
4328    if (cmd_buffer->state.conditional_render_enabled)
4329       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4330 
4331    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
4332 
4333    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4334     * different views.  We need to multiply instanceCount by the view count.
4335     */
4336    if (!pipeline->use_primitive_replication)
4337       instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4338 
4339    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4340       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4341       prim.VertexAccessType         = RANDOM;
4342       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4343       prim.VertexCountPerInstance   = indexCount;
4344       prim.StartVertexLocation      = firstIndex;
4345       prim.InstanceCount            = instanceCount;
4346       prim.StartInstanceLocation    = firstInstance;
4347       prim.BaseVertexLocation       = vertexOffset;
4348    }
4349 
4350    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4351 
4352    trace_intel_end_draw_indexed(&cmd_buffer->trace, cmd_buffer, count);
4353 }
4354 
4355 void genX(CmdDrawMultiIndexedEXT)(
4356     VkCommandBuffer                             commandBuffer,
4357     uint32_t                                    drawCount,
4358     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
4359     uint32_t                                    instanceCount,
4360     uint32_t                                    firstInstance,
4361     uint32_t                                    stride,
4362     const int32_t                              *pVertexOffset)
4363 {
4364    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4365    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4366    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4367 
4368    if (anv_batch_has_error(&cmd_buffer->batch))
4369       return;
4370 
4371    const uint32_t count = (drawCount *
4372                            instanceCount *
4373                            (pipeline->use_primitive_replication ?
4374                             1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4375    anv_measure_snapshot(cmd_buffer,
4376                         INTEL_SNAPSHOT_DRAW,
4377                         "draw indexed_multi",
4378                         count);
4379    trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace, cmd_buffer);
4380 
4381    genX(cmd_buffer_flush_state)(cmd_buffer);
4382 
4383    if (cmd_buffer->state.conditional_render_enabled)
4384       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4385 
4386    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4387     * different views.  We need to multiply instanceCount by the view count.
4388     */
4389    if (!pipeline->use_primitive_replication)
4390       instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4391 
4392    uint32_t i = 0;
4393    if (pVertexOffset) {
4394       if (vs_prog_data->uses_drawid) {
4395          bool emitted = true;
4396          if (vs_prog_data->uses_firstvertex ||
4397              vs_prog_data->uses_baseinstance) {
4398             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4399             emitted = true;
4400          }
4401          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4402             if (vs_prog_data->uses_drawid) {
4403                emit_draw_index(cmd_buffer, i);
4404                emitted = true;
4405             }
4406             /* Emitting draw index or vertex index BOs may result in needing
4407              * additional VF cache flushes.
4408              */
4409             if (emitted)
4410                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4411 
4412             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4413                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4414                prim.VertexAccessType         = RANDOM;
4415                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4416                prim.VertexCountPerInstance   = draw->indexCount;
4417                prim.StartVertexLocation      = draw->firstIndex;
4418                prim.InstanceCount            = instanceCount;
4419                prim.StartInstanceLocation    = firstInstance;
4420                prim.BaseVertexLocation       = *pVertexOffset;
4421             }
4422             emitted = false;
4423          }
4424       } else {
4425          if (vs_prog_data->uses_firstvertex ||
4426              vs_prog_data->uses_baseinstance) {
4427             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4428             /* Emitting draw index or vertex index BOs may result in needing
4429              * additional VF cache flushes.
4430              */
4431             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4432          }
4433          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4434             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4435                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4436                prim.VertexAccessType         = RANDOM;
4437                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4438                prim.VertexCountPerInstance   = draw->indexCount;
4439                prim.StartVertexLocation      = draw->firstIndex;
4440                prim.InstanceCount            = instanceCount;
4441                prim.StartInstanceLocation    = firstInstance;
4442                prim.BaseVertexLocation       = *pVertexOffset;
4443             }
4444          }
4445       }
4446    } else {
4447       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4448          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4449                                                     draw->vertexOffset,
4450                                                     firstInstance, i, i != 0);
4451 
4452          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4453             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4454             prim.VertexAccessType         = RANDOM;
4455             prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4456             prim.VertexCountPerInstance   = draw->indexCount;
4457             prim.StartVertexLocation      = draw->firstIndex;
4458             prim.InstanceCount            = instanceCount;
4459             prim.StartInstanceLocation    = firstInstance;
4460             prim.BaseVertexLocation       = draw->vertexOffset;
4461          }
4462       }
4463    }
4464 
4465    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4466 
4467    trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, cmd_buffer, count);
4468 }
4469 
4470 /* Auto-Draw / Indirect Registers */
4471 #define GFX7_3DPRIM_END_OFFSET          0x2420
4472 #define GFX7_3DPRIM_START_VERTEX        0x2430
4473 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
4474 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
4475 #define GFX7_3DPRIM_START_INSTANCE      0x243C
4476 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
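/* These are the MMIO registers that 3DPRIMITIVE reads its parameters from
 * when IndirectParameterEnable is set; the mi_builder stores in the draw
 * functions below fill them either from an indirect buffer or with
 * immediates.
 */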
4477 
4478 void genX(CmdDrawIndirectByteCountEXT)(
4479     VkCommandBuffer                             commandBuffer,
4480     uint32_t                                    instanceCount,
4481     uint32_t                                    firstInstance,
4482     VkBuffer                                    counterBuffer,
4483     VkDeviceSize                                counterBufferOffset,
4484     uint32_t                                    counterOffset,
4485     uint32_t                                    vertexStride)
4486 {
4487 #if GFX_VERx10 >= 75
4488    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4489    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
4490    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4491    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4492 
4493    /* firstVertex is always zero for this draw function */
4494    const uint32_t firstVertex = 0;
4495 
4496    if (anv_batch_has_error(&cmd_buffer->batch))
4497       return;
4498 
4499    anv_measure_snapshot(cmd_buffer,
4500                         INTEL_SNAPSHOT_DRAW,
4501                         "draw indirect byte count",
4502                         instanceCount);
4503    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace, cmd_buffer);
4504 
4505    genX(cmd_buffer_flush_state)(cmd_buffer);
4506 
4507    if (cmd_buffer->state.conditional_render_enabled)
4508       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4509 
4510    if (vs_prog_data->uses_firstvertex ||
4511        vs_prog_data->uses_baseinstance)
4512       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
4513    if (vs_prog_data->uses_drawid)
4514       emit_draw_index(cmd_buffer, 0);
4515 
4516    /* Emitting draw index or vertex index BOs may result in needing
4517     * additional VF cache flushes.
4518     */
4519    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4520 
4521    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4522     * different views.  We need to multiply instanceCount by the view count.
4523     */
4524    if (!pipeline->use_primitive_replication)
4525       instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4526 
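   /* Transform feedback draw: the vertex count is derived from the byte
    * counter written by the previous XFB pass, roughly
    *
    *    vertexCount = (counter[counterBufferOffset] - counterOffset) / vertexStride
    *
    * e.g. (hypothetical values) a 256-byte counter with counterOffset 0 and
    * a 16-byte vertex stride yields 16 vertices.
    */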
4527    struct mi_builder b;
4528    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4529    struct mi_value count =
4530       mi_mem32(anv_address_add(counter_buffer->address,
4531                                    counterBufferOffset));
4532    if (counterOffset)
4533       count = mi_isub(&b, count, mi_imm(counterOffset));
4534    count = mi_udiv32_imm(&b, count, vertexStride);
4535    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
4536 
4537    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
4538    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));
4539    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
4540    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4541 
4542    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4543       prim.IndirectParameterEnable  = true;
4544       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4545       prim.VertexAccessType         = SEQUENTIAL;
4546       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4547    }
4548 
4549    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4550 
4551    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace, cmd_buffer,
4552                                             instanceCount);
4553 #endif /* GFX_VERx10 >= 75 */
4554 }
4555 
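/* Loads the 3DPRIMITIVE parameter registers from an indirect draw packet.
 * The dword offsets below follow the Vulkan indirect command layouts:
 *
 *    VkDrawIndirectCommand:        { vertexCount, instanceCount,
 *                                    firstVertex, firstInstance }
 *    VkDrawIndexedIndirectCommand: { indexCount, instanceCount, firstIndex,
 *                                    vertexOffset, firstInstance }
 */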
4556 static void
4557 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
4558                          struct anv_address addr,
4559                          bool indexed)
4560 {
4561    struct mi_builder b;
4562    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4563 
4564    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
4565                 mi_mem32(anv_address_add(addr, 0)));
4566 
4567    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
4568    unsigned view_count = anv_cmd_buffer_get_view_count(cmd_buffer);
4569    if (view_count > 1) {
4570 #if GFX_VERx10 >= 75
4571       instance_count = mi_imul_imm(&b, instance_count, view_count);
4572 #else
4573       anv_finishme("Multiview + indirect draw requires MI_MATH; "
4574                    "MI_MATH is not supported on Ivy Bridge");
4575 #endif
4576    }
4577    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
4578 
4579    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
4580                 mi_mem32(anv_address_add(addr, 8)));
4581 
4582    if (indexed) {
4583       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
4584                    mi_mem32(anv_address_add(addr, 12)));
4585       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4586                    mi_mem32(anv_address_add(addr, 16)));
4587    } else {
4588       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4589                    mi_mem32(anv_address_add(addr, 12)));
4590       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4591    }
4592 }
4593 
4594 void genX(CmdDrawIndirect)(
4595     VkCommandBuffer                             commandBuffer,
4596     VkBuffer                                    _buffer,
4597     VkDeviceSize                                offset,
4598     uint32_t                                    drawCount,
4599     uint32_t                                    stride)
4600 {
4601    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4602    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4603    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4604    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4605 
4606    if (anv_batch_has_error(&cmd_buffer->batch))
4607       return;
4608 
4609    anv_measure_snapshot(cmd_buffer,
4610                         INTEL_SNAPSHOT_DRAW,
4611                         "draw indirect",
4612                         drawCount);
4613    trace_intel_begin_draw_indirect(&cmd_buffer->trace, cmd_buffer);
4614 
4615    genX(cmd_buffer_flush_state)(cmd_buffer);
4616 
4617    if (cmd_buffer->state.conditional_render_enabled)
4618       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4619 
4620    for (uint32_t i = 0; i < drawCount; i++) {
4621       struct anv_address draw = anv_address_add(buffer->address, offset);
4622 
4623       if (vs_prog_data->uses_firstvertex ||
4624           vs_prog_data->uses_baseinstance)
4625          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4626       if (vs_prog_data->uses_drawid)
4627          emit_draw_index(cmd_buffer, i);
4628 
4629       /* Emitting draw index or vertex index BOs may result in needing
4630        * additional VF cache flushes.
4631        */
4632       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4633 
4634       load_indirect_parameters(cmd_buffer, draw, false);
4635 
4636       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4637          prim.IndirectParameterEnable  = true;
4638          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4639          prim.VertexAccessType         = SEQUENTIAL;
4640          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4641       }
4642 
4643       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4644 
4645       offset += stride;
4646    }
4647 
4648    trace_intel_end_draw_indirect(&cmd_buffer->trace, cmd_buffer, drawCount);
4649 }
4650 
4651 void genX(CmdDrawIndexedIndirect)(
4652     VkCommandBuffer                             commandBuffer,
4653     VkBuffer                                    _buffer,
4654     VkDeviceSize                                offset,
4655     uint32_t                                    drawCount,
4656     uint32_t                                    stride)
4657 {
4658    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4659    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4660    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4661    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4662 
4663    if (anv_batch_has_error(&cmd_buffer->batch))
4664       return;
4665 
4666    anv_measure_snapshot(cmd_buffer,
4667                         INTEL_SNAPSHOT_DRAW,
4668                         "draw indexed indirect",
4669                         drawCount);
4670    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace, cmd_buffer);
4671 
4672    genX(cmd_buffer_flush_state)(cmd_buffer);
4673 
4674    if (cmd_buffer->state.conditional_render_enabled)
4675       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4676 
4677    for (uint32_t i = 0; i < drawCount; i++) {
4678       struct anv_address draw = anv_address_add(buffer->address, offset);
4679 
4680       /* TODO: We need to stomp base vertex to 0 somehow */
4681       if (vs_prog_data->uses_firstvertex ||
4682           vs_prog_data->uses_baseinstance)
4683          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4684       if (vs_prog_data->uses_drawid)
4685          emit_draw_index(cmd_buffer, i);
4686 
4687       /* Emitting draw index or vertex index BOs may result in needing
4688        * additional VF cache flushes.
4689        */
4690       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4691 
4692       load_indirect_parameters(cmd_buffer, draw, true);
4693 
4694       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4695          prim.IndirectParameterEnable  = true;
4696          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4697          prim.VertexAccessType         = RANDOM;
4698          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4699       }
4700 
4701       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4702 
4703       offset += stride;
4704    }
4705 
4706    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, cmd_buffer, drawCount);
4707 }
4708 
4709 static struct mi_value
4710 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4711                                  struct mi_builder *b,
4712                                  struct anv_buffer *count_buffer,
4713                                  uint64_t countBufferOffset)
4714 {
4715    struct anv_address count_address =
4716          anv_address_add(count_buffer->address, countBufferOffset);
4717 
4718    struct mi_value ret = mi_imm(0);
4719 
4720    if (cmd_buffer->state.conditional_render_enabled) {
4721 #if GFX_VERx10 >= 75
4722       ret = mi_new_gpr(b);
4723       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4724 #endif
4725    } else {
4726       /* Upload the current draw count from the draw parameters buffer to
4727        * MI_PREDICATE_SRC0.
4728        */
4729       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4730       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4731    }
4732 
4733    return ret;
4734 }
4735 
4736 static void
4737 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4738                           struct mi_builder *b,
4739                           uint32_t draw_index)
4740 {
4741    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4742    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4743 
4744    if (draw_index == 0) {
4745       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4746          mip.LoadOperation    = LOAD_LOADINV;
4747          mip.CombineOperation = COMBINE_SET;
4748          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4749       }
4750    } else {
4751       /* While draw_index < draw_count the predicate's result will be
4752        *  (draw_index == draw_count) ^ TRUE = TRUE
4753        * When draw_index == draw_count the result is
4754        *  (TRUE) ^ TRUE = FALSE
4755        * After this all results will be:
4756        *  (FALSE) ^ FALSE = FALSE
4757        */
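      /* For a hypothetical draw count of 2: draws 0 and 1 see a TRUE
       * predicate and execute, draw 2 flips the result to FALSE, and every
       * draw after that stays FALSE and is skipped.
       */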
4758       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4759          mip.LoadOperation    = LOAD_LOAD;
4760          mip.CombineOperation = COMBINE_XOR;
4761          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4762       }
4763    }
4764 }
4765 
4766 #if GFX_VERx10 >= 75
4767 static void
4768 emit_draw_count_predicate_with_conditional_render(
4769                           struct anv_cmd_buffer *cmd_buffer,
4770                           struct mi_builder *b,
4771                           uint32_t draw_index,
4772                           struct mi_value max)
4773 {
4774    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4775    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
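   /* pred is non-zero only when this draw index is below the indirect draw
    * count and the conditional rendering predicate passed.
    */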
4776 
4777 #if GFX_VER >= 8
4778    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4779 #else
4780    /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
4781     * so we emit MI_PREDICATE to set it.
4782     */
4783 
4784    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4785    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4786 
4787    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4788       mip.LoadOperation    = LOAD_LOADINV;
4789       mip.CombineOperation = COMBINE_SET;
4790       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4791    }
4792 #endif
4793 }
4794 #endif
4795 
4796 static void
4797 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4798                                struct mi_builder *b,
4799                                uint32_t draw_index,
4800                                struct mi_value max)
4801 {
4802 #if GFX_VERx10 >= 75
4803    if (cmd_buffer->state.conditional_render_enabled) {
4804       emit_draw_count_predicate_with_conditional_render(
4805             cmd_buffer, b, draw_index, mi_value_ref(b, max));
4806    } else {
4807       emit_draw_count_predicate(cmd_buffer, b, draw_index);
4808    }
4809 #else
4810    emit_draw_count_predicate(cmd_buffer, b, draw_index);
4811 #endif
4812 }
4813 
4814 void genX(CmdDrawIndirectCount)(
4815     VkCommandBuffer                             commandBuffer,
4816     VkBuffer                                    _buffer,
4817     VkDeviceSize                                offset,
4818     VkBuffer                                    _countBuffer,
4819     VkDeviceSize                                countBufferOffset,
4820     uint32_t                                    maxDrawCount,
4821     uint32_t                                    stride)
4822 {
4823    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4824    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4825    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4826    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4827    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4828    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4829 
4830    if (anv_batch_has_error(&cmd_buffer->batch))
4831       return;
4832 
4833    anv_measure_snapshot(cmd_buffer,
4834                         INTEL_SNAPSHOT_DRAW,
4835                         "draw indirect count",
4836                         0);
4837    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace, cmd_buffer);
4838 
4839    genX(cmd_buffer_flush_state)(cmd_buffer);
4840 
4841    struct mi_builder b;
4842    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4843    struct mi_value max =
4844       prepare_for_draw_count_predicate(cmd_buffer, &b,
4845                                        count_buffer, countBufferOffset);
4846 
4847    for (uint32_t i = 0; i < maxDrawCount; i++) {
4848       struct anv_address draw = anv_address_add(buffer->address, offset);
4849 
4850       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4851 
4852       if (vs_prog_data->uses_firstvertex ||
4853           vs_prog_data->uses_baseinstance)
4854          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4855       if (vs_prog_data->uses_drawid)
4856          emit_draw_index(cmd_buffer, i);
4857 
4858       /* Emitting draw index or vertex index BOs may result in needing
4859        * additional VF cache flushes.
4860        */
4861       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4862 
4863       load_indirect_parameters(cmd_buffer, draw, false);
4864 
4865       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4866          prim.IndirectParameterEnable  = true;
4867          prim.PredicateEnable          = true;
4868          prim.VertexAccessType         = SEQUENTIAL;
4869          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4870       }
4871 
4872       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4873 
4874       offset += stride;
4875    }
4876 
4877    mi_value_unref(&b, max);
4878 
4879    trace_intel_end_draw_indirect_count(&cmd_buffer->trace, cmd_buffer, maxDrawCount);
4880 }
4881 
4882 void genX(CmdDrawIndexedIndirectCount)(
4883     VkCommandBuffer                             commandBuffer,
4884     VkBuffer                                    _buffer,
4885     VkDeviceSize                                offset,
4886     VkBuffer                                    _countBuffer,
4887     VkDeviceSize                                countBufferOffset,
4888     uint32_t                                    maxDrawCount,
4889     uint32_t                                    stride)
4890 {
4891    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4892    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4893    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4894    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4895    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4896    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4897 
4898    if (anv_batch_has_error(&cmd_buffer->batch))
4899       return;
4900 
4901    anv_measure_snapshot(cmd_buffer,
4902                         INTEL_SNAPSHOT_DRAW,
4903                         "draw indexed indirect count",
4904                         0);
4905    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace, cmd_buffer);
4906 
4907    genX(cmd_buffer_flush_state)(cmd_buffer);
4908 
4909    struct mi_builder b;
4910    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4911    struct mi_value max =
4912       prepare_for_draw_count_predicate(cmd_buffer, &b,
4913                                        count_buffer, countBufferOffset);
4914 
4915    for (uint32_t i = 0; i < maxDrawCount; i++) {
4916       struct anv_address draw = anv_address_add(buffer->address, offset);
4917 
4918       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4919 
4920       /* TODO: We need to stomp base vertex to 0 somehow */
4921       if (vs_prog_data->uses_firstvertex ||
4922           vs_prog_data->uses_baseinstance)
4923          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4924       if (vs_prog_data->uses_drawid)
4925          emit_draw_index(cmd_buffer, i);
4926 
4927       /* Emitting draw index or vertex index BOs may result in needing
4928        * additional VF cache flushes.
4929        */
4930       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4931 
4932       load_indirect_parameters(cmd_buffer, draw, true);
4933 
4934       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4935          prim.IndirectParameterEnable  = true;
4936          prim.PredicateEnable          = true;
4937          prim.VertexAccessType         = RANDOM;
4938          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4939       }
4940 
4941       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4942 
4943       offset += stride;
4944    }
4945 
4946    mi_value_unref(&b, max);
4947 
4948    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
4949                                                cmd_buffer, maxDrawCount);
4950 
4951 }
4952 
4953 void genX(CmdBeginTransformFeedbackEXT)(
4954     VkCommandBuffer                             commandBuffer,
4955     uint32_t                                    firstCounterBuffer,
4956     uint32_t                                    counterBufferCount,
4957     const VkBuffer*                             pCounterBuffers,
4958     const VkDeviceSize*                         pCounterBufferOffsets)
4959 {
4960    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4961 
4962    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4963    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4964    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4965 
4966    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4967     *
4968     *    "Software must ensure that no HW stream output operations can be in
4969     *    process or otherwise pending at the point that the MI_LOAD/STORE
4970     *    commands are processed. This will likely require a pipeline flush."
4971     */
4972    anv_add_pending_pipe_bits(cmd_buffer,
4973                              ANV_PIPE_CS_STALL_BIT,
4974                              "begin transform feedback");
4975    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4976 
4977    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4978       /* If we have a counter buffer, this is a resume so we need to load the
4979        * value into the streamout offset register.  Otherwise, this is a begin
4980        * and we need to reset it to zero.
4981        */
4982       if (pCounterBuffers &&
4983           idx >= firstCounterBuffer &&
4984           idx - firstCounterBuffer < counterBufferCount &&
4985           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4986          uint32_t cb_idx = idx - firstCounterBuffer;
4987          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4988          uint64_t offset = pCounterBufferOffsets ?
4989                            pCounterBufferOffsets[cb_idx] : 0;
4990 
4991          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4992             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4993             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
4994                                                    offset);
4995          }
4996       } else {
4997          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4998             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4999             lri.DataDWord        = 0;
5000          }
5001       }
5002    }
5003 
5004    cmd_buffer->state.xfb_enabled = true;
5005    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
5006 }
5007 
5008 void genX(CmdEndTransformFeedbackEXT)(
5009     VkCommandBuffer                             commandBuffer,
5010     uint32_t                                    firstCounterBuffer,
5011     uint32_t                                    counterBufferCount,
5012     const VkBuffer*                             pCounterBuffers,
5013     const VkDeviceSize*                         pCounterBufferOffsets)
5014 {
5015    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5016 
5017    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
5018    assert(counterBufferCount <= MAX_XFB_BUFFERS);
5019    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
5020 
5021    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
5022     *
5023     *    "Software must ensure that no HW stream output operations can be in
5024     *    process or otherwise pending at the point that the MI_LOAD/STORE
5025     *    commands are processed. This will likely require a pipeline flush."
5026     */
5027    anv_add_pending_pipe_bits(cmd_buffer,
5028                              ANV_PIPE_CS_STALL_BIT,
5029                              "end transform feedback");
5030    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5031 
5032    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
5033       unsigned idx = firstCounterBuffer + cb_idx;
5034 
5035       /* If we have a counter buffer, store the current value of the streamout
5036        * offset register into it so that a future resume (a begin with a counter
5037        * buffer) can pick up where this transform feedback session left off.
5038        */
5039       if (pCounterBuffers &&
5040           cb_idx < counterBufferCount &&
5041           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
5042          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
5043          uint64_t offset = pCounterBufferOffsets ?
5044                            pCounterBufferOffsets[cb_idx] : 0;
5045 
5046          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
5047             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
5048                                                    offset);
5049             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
5050          }
5051       }
5052    }
5053 
5054    cmd_buffer->state.xfb_enabled = false;
5055    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
5056 }
5057 
5058 #if GFX_VERx10 >= 125
5059 void
5060 genX(CmdDrawMeshTasksNV)(
5061     VkCommandBuffer                             commandBuffer,
5062     uint32_t                                    taskCount,
5063     uint32_t                                    firstTask)
5064 {
5065    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5066 
5067    if (anv_batch_has_error(&cmd_buffer->batch))
5068       return;
5069 
5070    /* TODO(mesh): Check if this is not emitting more packets than we need. */
5071    genX(cmd_buffer_flush_state)(cmd_buffer);
5072 
5073    if (cmd_buffer->state.conditional_render_enabled)
5074       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5075 
5076    /* BSpec 54016 says: "The values passed for Starting ThreadGroup ID X
5077     * and ThreadGroup Count X shall not cause TGIDs to exceed (2^32)-1."
5078     */
5079    assert((int64_t)firstTask + taskCount - 1 <= UINT32_MAX);
5080 
5081    anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_1D), m) {
5082       m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
5083       m.ThreadGroupCountX = taskCount;
5084       m.StartingThreadGroupIDX = firstTask;
5085    }
5086 }
5087 
5088 #define GFX125_3DMESH_TG_COUNT 0x26F0
5089 #define GFX125_3DMESH_STARTING_TGID 0x26F4
5090 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
5091 
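/* Load the indirect mesh draw parameters into the 3DMESH_1D dispatch
 * registers: taskCount -> thread group count, firstTask -> starting thread
 * group ID.  When requested, extended parameter 0 is also written; the
 * callers below use it to pass the draw index for shaders that read
 * gl_DrawID.
 */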
5092 static void
5093 mesh_load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
5094                               struct mi_builder *b,
5095                               struct anv_address addr,
5096                               bool emit_xp0,
5097                               uint32_t xp0)
5098 {
5099    const size_t taskCountOff = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
5100    const size_t firstTaskOff = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
5101 
5102    mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
5103                mi_mem32(anv_address_add(addr, taskCountOff)));
5104 
5105    mi_store(b, mi_reg32(GFX125_3DMESH_STARTING_TGID),
5106                mi_mem32(anv_address_add(addr, firstTaskOff)));
5107 
5108    if (emit_xp0)
5109       mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
5110 }
5111 
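/* Emit an indirect 3DMESH_1D.  The packet grows by one dword when the
 * extended parameter (the draw index) is present, hence the adjusted
 * length.
 */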
5112 static void
5113 emit_indirect_3dmesh_1d(struct anv_batch *batch,
5114                         bool predicate_enable,
5115                         bool uses_drawid)
5116 {
5117    uint32_t len = GENX(3DMESH_1D_length) + uses_drawid;
5118    anv_batch_emitn(batch, len, GENX(3DMESH_1D),
5119                    .PredicateEnable           = predicate_enable,
5120                    .IndirectParameterEnable   = true,
5121                    .ExtendedParameter0Present = uses_drawid);
5122 }
5123 
5124 void
5125 genX(CmdDrawMeshTasksIndirectNV)(
5126     VkCommandBuffer                             commandBuffer,
5127     VkBuffer                                    _buffer,
5128     VkDeviceSize                                offset,
5129     uint32_t                                    drawCount,
5130     uint32_t                                    stride)
5131 {
5132    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5133    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5134    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5135    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5136    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5137    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5138 
5139    if (anv_batch_has_error(&cmd_buffer->batch))
5140       return;
5141 
5142    genX(cmd_buffer_flush_state)(cmd_buffer);
5143 
5144    if (cmd_state->conditional_render_enabled)
5145       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5146 
5147    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5148                        mesh_prog_data->uses_drawid;
5149    struct mi_builder b;
5150    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5151 
5152    for (uint32_t i = 0; i < drawCount; i++) {
5153       struct anv_address draw = anv_address_add(buffer->address, offset);
5154 
5155       mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5156 
5157       emit_indirect_3dmesh_1d(&cmd_buffer->batch,
5158             cmd_state->conditional_render_enabled, uses_drawid);
5159 
5160       offset += stride;
5161    }
5162 }
5163 
5164 void
5165 genX(CmdDrawMeshTasksIndirectCountNV)(
5166     VkCommandBuffer                             commandBuffer,
5167     VkBuffer                                    _buffer,
5168     VkDeviceSize                                offset,
5169     VkBuffer                                    _countBuffer,
5170     VkDeviceSize                                countBufferOffset,
5171     uint32_t                                    maxDrawCount,
5172     uint32_t                                    stride)
5173 {
5174    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5175    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5176    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
5177    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5178    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5179    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5180 
5181    if (anv_batch_has_error(&cmd_buffer->batch))
5182       return;
5183 
5184    genX(cmd_buffer_flush_state)(cmd_buffer);
5185 
5186    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5187                        mesh_prog_data->uses_drawid;
5188 
5189    struct mi_builder b;
5190    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5191 
5192    struct mi_value max =
5193          prepare_for_draw_count_predicate(cmd_buffer, &b,
5194                                           count_buffer, countBufferOffset);
5195 
5196    for (uint32_t i = 0; i < maxDrawCount; i++) {
5197       struct anv_address draw = anv_address_add(buffer->address, offset);
5198 
5199       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
5200 
5201       mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5202 
5203       emit_indirect_3dmesh_1d(&cmd_buffer->batch, true, uses_drawid);
5204 
5205       offset += stride;
5206    }
5207 }
5208 #endif /* GFX_VERx10 >= 125 */
5209 
5210 void
5211 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
5212 {
5213    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5214    struct anv_compute_pipeline *pipeline = comp_state->pipeline;
5215 
5216    assert(pipeline->cs);
5217 
5218    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5219 
5220    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5221 
5222    /* Apply any pending pipeline flushes we may have.  We want to apply them
5223     * now because, if any of those flushes are for things like push constants,
5224     * the GPU will read the state at weird times.
5225     */
5226    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5227 
5228    if (cmd_buffer->state.compute.pipeline_dirty) {
5229       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
5230        *
5231        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
5232        *    the only bits that are changed are scoreboard related: Scoreboard
5233        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
5234        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
5235        *    sufficient."
5236        */
5237       anv_add_pending_pipe_bits(cmd_buffer,
5238                               ANV_PIPE_CS_STALL_BIT,
5239                               "flush compute state");
5240       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5241 
5242       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
5243 
5244       /* The workgroup size of the pipeline affects our push constant layout
5245        * so flag push constants as dirty if we change the pipeline.
5246        */
5247       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5248    }
5249 
5250    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
5251        cmd_buffer->state.compute.pipeline_dirty) {
5252       flush_descriptor_sets(cmd_buffer,
5253                             &cmd_buffer->state.compute.base,
5254                             VK_SHADER_STAGE_COMPUTE_BIT,
5255                             &pipeline->cs, 1);
5256       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5257 
5258 #if GFX_VERx10 < 125
5259       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
5260       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
5261          .BindingTablePointer =
5262             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5263          .SamplerStatePointer =
5264             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5265       };
5266       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
5267 
5268       struct anv_state state =
5269          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
5270                                       pipeline->interface_descriptor_data,
5271                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
5272                                       64);
5273 
5274       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
5275       anv_batch_emit(&cmd_buffer->batch,
5276                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
5277          mid.InterfaceDescriptorTotalLength        = size;
5278          mid.InterfaceDescriptorDataStartAddress   = state.offset;
5279       }
5280 #endif
5281    }
5282 
5283    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
5284       comp_state->push_data =
5285          anv_cmd_buffer_cs_push_constants(cmd_buffer);
5286 
5287 #if GFX_VERx10 < 125
5288       if (comp_state->push_data.alloc_size) {
5289          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
5290             curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
5291             curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
5292          }
5293       }
5294 #endif
5295 
5296       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5297    }
5298 
5299    cmd_buffer->state.compute.pipeline_dirty = false;
5300 
5301    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5302 }
5303 
5304 #if GFX_VER == 7
5305 
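/* GFX_VER == 7 only: the i915 command parser must be new enough to allow
 * writing the registers used below; otherwise return
 * VK_ERROR_FEATURE_NOT_PRESENT.
 */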
5306 static VkResult
5307 verify_cmd_parser(const struct anv_device *device,
5308                   int required_version,
5309                   const char *function)
5310 {
5311    if (device->physical->cmd_parser_version < required_version) {
5312       return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
5313                        "cmd parser version %d is required for %s",
5314                        required_version, function);
5315    } else {
5316       return VK_SUCCESS;
5317    }
5318 }
5319 
5320 #endif
5321 
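/* Record the base workgroup ID (vkCmdDispatchBase) in the compute push
 * constants, flagging them dirty only when the value actually changes.
 */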
5322 static void
5323 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
5324                                   uint32_t baseGroupX,
5325                                   uint32_t baseGroupY,
5326                                   uint32_t baseGroupZ)
5327 {
5328    if (anv_batch_has_error(&cmd_buffer->batch))
5329       return;
5330 
5331    struct anv_push_constants *push =
5332       &cmd_buffer->state.compute.base.push_constants;
5333    if (push->cs.base_work_group_id[0] != baseGroupX ||
5334        push->cs.base_work_group_id[1] != baseGroupY ||
5335        push->cs.base_work_group_id[2] != baseGroupZ) {
5336       push->cs.base_work_group_id[0] = baseGroupX;
5337       push->cs.base_work_group_id[1] = baseGroupY;
5338       push->cs.base_work_group_id[2] = baseGroupZ;
5339 
5340       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5341    }
5342 }
5343 
5344 void genX(CmdDispatch)(
5345     VkCommandBuffer                             commandBuffer,
5346     uint32_t                                    x,
5347     uint32_t                                    y,
5348     uint32_t                                    z)
5349 {
5350    genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
5351 }
5352 
5353 #if GFX_VERx10 >= 125
5354 
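/* Emit a COMPUTE_WALKER for the current compute pipeline.  The SIMDSize
 * field appears to encode SIMD8/16/32 as 0/1/2 (hence simd_size / 16), and
 * the Local*Maximum fields are inclusive (hence the "- 1").
 */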
5355 static inline void
5356 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
5357                     const struct anv_compute_pipeline *pipeline, bool indirect,
5358                     const struct brw_cs_prog_data *prog_data,
5359                     uint32_t groupCountX, uint32_t groupCountY,
5360                     uint32_t groupCountZ)
5361 {
5362    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5363    const struct anv_shader_bin *cs_bin = pipeline->cs;
5364    bool predicate = cmd_buffer->state.conditional_render_enabled;
5365 
5366    const struct intel_device_info *devinfo = &pipeline->base.device->info;
5367    const struct brw_cs_dispatch_info dispatch =
5368       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5369 
5370    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5371       cw.IndirectParameterEnable        = indirect;
5372       cw.PredicateEnable                = predicate;
5373       cw.SIMDSize                       = dispatch.simd_size / 16;
5374       cw.IndirectDataStartAddress       = comp_state->push_data.offset;
5375       cw.IndirectDataLength             = comp_state->push_data.alloc_size;
5376       cw.LocalXMaximum                  = prog_data->local_size[0] - 1;
5377       cw.LocalYMaximum                  = prog_data->local_size[1] - 1;
5378       cw.LocalZMaximum                  = prog_data->local_size[2] - 1;
5379       cw.ThreadGroupIDXDimension        = groupCountX;
5380       cw.ThreadGroupIDYDimension        = groupCountY;
5381       cw.ThreadGroupIDZDimension        = groupCountZ;
5382       cw.ExecutionMask                  = dispatch.right_mask;
5383       cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);
5384 
5385       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5386          .KernelStartPointer = cs_bin->kernel.offset,
5387          .SamplerStatePointer =
5388             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5389          .BindingTablePointer =
5390             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5391          .BindingTableEntryCount =
5392             1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
5393          .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
5394          .SharedLocalMemorySize = encode_slm_size(GFX_VER,
5395                                                   prog_data->base.total_shared),
5396          .NumberOfBarriers = prog_data->uses_barrier,
5397       };
5398    }
5399 }
5400 
5401 #else /* #if GFX_VERx10 >= 125 */
5402 
5403 static inline void
5404 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
5405                   const struct anv_compute_pipeline *pipeline, bool indirect,
5406                   const struct brw_cs_prog_data *prog_data,
5407                   uint32_t groupCountX, uint32_t groupCountY,
5408                   uint32_t groupCountZ)
5409 {
5410    bool predicate = (GFX_VER <= 7 && indirect) ||
5411       cmd_buffer->state.conditional_render_enabled;
5412 
5413    const struct intel_device_info *devinfo = &pipeline->base.device->info;
5414    const struct brw_cs_dispatch_info dispatch =
5415       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5416 
5417    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
5418       ggw.IndirectParameterEnable      = indirect;
5419       ggw.PredicateEnable              = predicate;
5420       ggw.SIMDSize                     = dispatch.simd_size / 16;
5421       ggw.ThreadDepthCounterMaximum    = 0;
5422       ggw.ThreadHeightCounterMaximum   = 0;
5423       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
5424       ggw.ThreadGroupIDXDimension      = groupCountX;
5425       ggw.ThreadGroupIDYDimension      = groupCountY;
5426       ggw.ThreadGroupIDZDimension      = groupCountZ;
5427       ggw.RightExecutionMask           = dispatch.right_mask;
5428       ggw.BottomExecutionMask          = 0xffffffff;
5429    }
5430 
5431    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
5432 }
5433 
5434 #endif /* #if GFX_VERx10 >= 125 */
5435 
5436 static inline void
5437 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
5438                const struct anv_compute_pipeline *pipeline, bool indirect,
5439                const struct brw_cs_prog_data *prog_data,
5440                uint32_t groupCountX, uint32_t groupCountY,
5441                uint32_t groupCountZ)
5442 {
5443 #if GFX_VERx10 >= 125
5444    emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5445                        groupCountY, groupCountZ);
5446 #else
5447    emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5448                      groupCountY, groupCountZ);
5449 #endif
5450 }
5451 
5452 void genX(CmdDispatchBase)(
5453     VkCommandBuffer                             commandBuffer,
5454     uint32_t                                    baseGroupX,
5455     uint32_t                                    baseGroupY,
5456     uint32_t                                    baseGroupZ,
5457     uint32_t                                    groupCountX,
5458     uint32_t                                    groupCountY,
5459     uint32_t                                    groupCountZ)
5460 {
5461    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5462    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5463    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5464 
5465    anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
5466                                      baseGroupY, baseGroupZ);
5467 
5468    if (anv_batch_has_error(&cmd_buffer->batch))
5469       return;
5470 
5471    anv_measure_snapshot(cmd_buffer,
5472                         INTEL_SNAPSHOT_COMPUTE,
5473                         "compute",
5474                         groupCountX * groupCountY * groupCountZ *
5475                         prog_data->local_size[0] * prog_data->local_size[1] *
5476                         prog_data->local_size[2]);
5477 
5478    trace_intel_begin_compute(&cmd_buffer->trace, cmd_buffer);
5479 
5480    if (prog_data->uses_num_work_groups) {
5481       struct anv_state state =
5482          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
5483       uint32_t *sizes = state.map;
5484       sizes[0] = groupCountX;
5485       sizes[1] = groupCountY;
5486       sizes[2] = groupCountZ;
5487       cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
5488          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5489          .offset = state.offset,
5490       };
5491 
5492       /* The num_workgroups buffer goes in the binding table */
5493       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5494    }
5495 
5496    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5497 
5498    if (cmd_buffer->state.conditional_render_enabled)
5499       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5500 
5501    emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
5502                   groupCountY, groupCountZ);
5503 
5504    trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer,
5505                            groupCountX, groupCountY, groupCountZ);
5506 }
5507 
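/* MMIO offsets of the GPGPU indirect dispatch dimension registers.  The
 * indirect dispatch path below fills them from the application's
 * VkDispatchIndirectCommand using the MI builder.
 */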
5508 #define GPGPU_DISPATCHDIMX 0x2500
5509 #define GPGPU_DISPATCHDIMY 0x2504
5510 #define GPGPU_DISPATCHDIMZ 0x2508
5511 
5512 void genX(CmdDispatchIndirect)(
5513     VkCommandBuffer                             commandBuffer,
5514     VkBuffer                                    _buffer,
5515     VkDeviceSize                                offset)
5516 {
5517    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5518    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5519    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5520    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5521    struct anv_address addr = anv_address_add(buffer->address, offset);
5522    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
5523 
5524    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
5525 
5526 #if GFX_VER == 7
5527    /* Linux 4.4 added command parser version 5 which allows the GPGPU
5528     * indirect dispatch registers to be written.
5529     */
5530    if (verify_cmd_parser(cmd_buffer->device, 5,
5531                          "vkCmdDispatchIndirect") != VK_SUCCESS)
5532       return;
5533 #endif
5534 
5535    anv_measure_snapshot(cmd_buffer,
5536                         INTEL_SNAPSHOT_COMPUTE,
5537                         "compute indirect",
5538                         0);
5539    trace_intel_begin_compute(&cmd_buffer->trace, cmd_buffer);
5540 
5541    if (prog_data->uses_num_work_groups) {
5542       cmd_buffer->state.compute.num_workgroups = addr;
5543 
5544       /* The num_workgroups buffer goes in the binding table */
5545       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5546    }
5547 
5548    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5549 
5550    struct mi_builder b;
5551    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5552 
5553    struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
5554    struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
5555    struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
5556 
5557    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
5558    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
5559    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
5560 
5561 #if GFX_VER <= 7
5562    /* predicate = (compute_dispatch_indirect_x_size == 0); */
5563    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
5564    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5565    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5566       mip.LoadOperation    = LOAD_LOAD;
5567       mip.CombineOperation = COMBINE_SET;
5568       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5569    }
5570 
5571    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
5572    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
5573    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5574       mip.LoadOperation    = LOAD_LOAD;
5575       mip.CombineOperation = COMBINE_OR;
5576       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5577    }
5578 
5579    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
5580    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
5581    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5582       mip.LoadOperation    = LOAD_LOAD;
5583       mip.CombineOperation = COMBINE_OR;
5584       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5585    }
5586 
5587    /* predicate = !predicate; */
5588    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5589       mip.LoadOperation    = LOAD_LOADINV;
5590       mip.CombineOperation = COMBINE_OR;
5591       mip.CompareOperation = COMPARE_FALSE;
5592    }
5593 
5594 #if GFX_VERx10 == 75
5595    if (cmd_buffer->state.conditional_render_enabled) {
5596       /* predicate &= !(conditional_rendering_predicate == 0); */
5597       mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
5598                    mi_reg32(ANV_PREDICATE_RESULT_REG));
5599       anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5600          mip.LoadOperation    = LOAD_LOADINV;
5601          mip.CombineOperation = COMBINE_AND;
5602          mip.CompareOperation = COMPARE_SRCS_EQUAL;
5603       }
5604    }
5605 #endif
5606 
5607 #else /* GFX_VER > 7 */
5608    if (cmd_buffer->state.conditional_render_enabled)
5609       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5610 #endif
5611 
5612    emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
5613 
5614    trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer, 0, 0, 0);
5615 }
5616 
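/* Allocate and fill an RT_DISPATCH_GLOBALS used for ray queries
 * (GFX_VERx10 >= 125 only).  The returned state lives in the dynamic state
 * pool; callers are expected to point ray-query-capable shaders at it.
 */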
5617 struct anv_state
5618 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
5619 {
5620 #if GFX_VERx10 >= 125
5621    struct anv_device *device = cmd_buffer->device;
5622 
5623    struct anv_state state =
5624       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5625                                          BRW_RT_DISPATCH_GLOBALS_SIZE,
5626                                          64);
5627    struct brw_rt_scratch_layout layout;
5628    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
5629                                        * some cases?
5630                                        */
5631    brw_rt_compute_scratch_layout(&layout, &device->info,
5632                                  stack_ids_per_dss, 1 << 10);
5633 
5634    struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5635       .MemBaseAddress = (struct anv_address) {
5636          /* The ray query HW computes offsets from the top of the buffer, so
5637           * point the base address at the end of the buffer.
5638           */
5639          .bo = device->ray_query_bo,
5640          .offset = device->ray_query_bo->size
5641       },
5642       .AsyncRTStackSize = layout.ray_stack_stride / 64,
5643       .NumDSSRTStacks = layout.stack_ids_per_dss,
5644       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5645       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5646       .ResumeShaderTable = (struct anv_address) {
5647          .bo = cmd_buffer->state.ray_query_shadow_bo,
5648       },
5649    };
5650    GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg);
5651 
5652    return state;
5653 #else
5654    unreachable("Not supported");
5655 #endif
5656 }
5657 
5658 #if GFX_VERx10 >= 125
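/* Distribute up to three doublings (a local size of at most 8 invocations,
 * which lines up with the SIMD8 COMPUTE_WALKER emitted below) across the
 * launch dimensions, favoring X.  For example, global = { 5, 7, 1 } yields
 * local_shift = { 2, 1, 0 }, i.e. a 4x2x1 local size.
 */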
5659 static void
5660 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
5661 {
5662    unsigned total_shift = 0;
5663    memset(local_shift, 0, 3);
5664 
5665    bool progress;
5666    do {
5667       progress = false;
5668       for (unsigned i = 0; i < 3; i++) {
5669          assert(global[i] > 0);
5670          if ((1 << local_shift[i]) < global[i]) {
5671             progress = true;
5672             local_shift[i]++;
5673             total_shift++;
5674          }
5675 
5676          if (total_shift == 3)
5677             return;
5678       }
5679    } while(progress);
5680 
5681    /* Assign whatever's left to x */
5682    local_shift[0] += 3 - total_shift;
5683 }
5684 
5685 static struct GFX_RT_SHADER_TABLE
5686 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
5687 {
5688    return (struct GFX_RT_SHADER_TABLE) {
5689       .BaseAddress = anv_address_from_u64(region->deviceAddress),
5690       .Stride = region->stride,
5691    };
5692 }
5693 
5694 static void
5695 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
5696                       const VkStridedDeviceAddressRegionKHR *raygen_sbt,
5697                       const VkStridedDeviceAddressRegionKHR *miss_sbt,
5698                       const VkStridedDeviceAddressRegionKHR *hit_sbt,
5699                       const VkStridedDeviceAddressRegionKHR *callable_sbt,
5700                       bool is_indirect,
5701                       uint32_t launch_width,
5702                       uint32_t launch_height,
5703                       uint32_t launch_depth,
5704                       uint64_t launch_size_addr)
5705 {
5706    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
5707    struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
5708 
5709    if (anv_batch_has_error(&cmd_buffer->batch))
5710       return;
5711 
5712    /* If we have a known degenerate launch size, just bail */
5713    if (!is_indirect &&
5714        (launch_width == 0 || launch_height == 0 || launch_depth == 0))
5715       return;
5716 
5717    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5718    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5719 
5720    cmd_buffer->state.rt.pipeline_dirty = false;
5721 
5722    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5723 
5724    /* Add these to the reloc list as they're internal buffers that don't
5725     * actually have relocs to pick them up manually.
5726     *
5727     * TODO(RT): This is a bit of a hack
5728     */
5729    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
5730                          cmd_buffer->batch.alloc,
5731                          rt->scratch.bo);
5732 
5733    /* Allocate and set up our RT_DISPATCH_GLOBALS */
5734    struct anv_state rtdg_state =
5735       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5736                                          BRW_RT_PUSH_CONST_OFFSET +
5737                                          sizeof(struct anv_push_constants),
5738                                          64);
5739 
5740    struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5741       .MemBaseAddress = (struct anv_address) {
5742          .bo = rt->scratch.bo,
5743          .offset = rt->scratch.layout.ray_stack_start,
5744       },
5745       .CallStackHandler =
5746          anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
5747       .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
5748       .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
5749       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5750       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5751       .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
5752       .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
5753       .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
5754       .LaunchWidth = launch_width,
5755       .LaunchHeight = launch_height,
5756       .LaunchDepth = launch_depth,
5757       .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
5758    };
5759    GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
5760 
5761    /* Push constants go after the RT_DISPATCH_GLOBALS */
5762    assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
5763    memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
5764           &cmd_buffer->state.rt.base.push_constants,
5765           sizeof(struct anv_push_constants));
5766 
5767    struct anv_address rtdg_addr = {
5768       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5769       .offset = rtdg_state.offset,
5770    };
5771 
5772    uint8_t local_size_log2[3];
5773    uint32_t global_size[3] = {};
5774    if (is_indirect) {
5775       /* Pick a local size that's probably ok.  We assume most TraceRays calls
5776        * will use a two-dimensional dispatch size.  Worst case, our initial
5777        * dispatch will be a little slower than it has to be.
5778        */
5779       local_size_log2[0] = 2;
5780       local_size_log2[1] = 1;
5781       local_size_log2[2] = 0;
5782 
5783       struct mi_builder b;
5784       mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5785 
5786       struct mi_value launch_size[3] = {
5787          mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
5788          mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
5789          mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
5790       };
5791 
5792       /* Store the original launch size into RT_DISPATCH_GLOBALS
5793        *
5794        * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
5795        * moved into a genX version.
5796        */
5797       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
5798                mi_value_ref(&b, launch_size[0]));
5799       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
5800                mi_value_ref(&b, launch_size[1]));
5801       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
5802                mi_value_ref(&b, launch_size[2]));
5803 
5804       /* Compute the global dispatch size */
5805       for (unsigned i = 0; i < 3; i++) {
5806          if (local_size_log2[i] == 0)
5807             continue;
5808 
5809          /* global_size = DIV_ROUND_UP(launch_size, local_size)
5810           *
5811           * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm
5812           * has the semantics of shifting the entire 64-bit value and taking
5813           * the bottom 32 bits, so we don't have to worry about roll-over.
5814           */
5815          uint32_t local_size = 1 << local_size_log2[i];
5816          launch_size[i] = mi_iadd(&b, launch_size[i],
5817                                       mi_imm(local_size - 1));
5818          launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
5819                                             local_size_log2[i]);
5820       }
5821 
5822       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
5823       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
5824       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
5825    } else {
5826       uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
5827       calc_local_trace_size(local_size_log2, launch_size);
5828 
5829       for (unsigned i = 0; i < 3; i++) {
5830          /* We have to be a bit careful here: because DIV_ROUND_UP adds to the
5831           * numerator, the value may overflow.  Cast to uint64_t to avoid this.
5832           */
5833          uint32_t local_size = 1 << local_size_log2[i];
5834          global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
5835       }
5836    }
5837 
5838    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5839       cw.IndirectParameterEnable        = is_indirect;
5840       cw.PredicateEnable                = false;
5841       cw.SIMDSize                       = SIMD8;
5842       cw.LocalXMaximum                  = (1 << local_size_log2[0]) - 1;
5843       cw.LocalYMaximum                  = (1 << local_size_log2[1]) - 1;
5844       cw.LocalZMaximum                  = (1 << local_size_log2[2]) - 1;
5845       cw.ThreadGroupIDXDimension        = global_size[0];
5846       cw.ThreadGroupIDYDimension        = global_size[1];
5847       cw.ThreadGroupIDZDimension        = global_size[2];
5848       cw.ExecutionMask                  = 0xff;
5849       cw.EmitInlineParameter            = true;
5850       cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);
5851 
5852       const gl_shader_stage s = MESA_SHADER_RAYGEN;
5853       struct anv_device *device = cmd_buffer->device;
5854       struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
5855       struct anv_state *samplers = &cmd_buffer->state.samplers[s];
5856       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5857          .KernelStartPointer = device->rt_trampoline->kernel.offset,
5858          .SamplerStatePointer = samplers->offset,
5859          /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
5860          .SamplerCount = 0,
5861          .BindingTablePointer = surfaces->offset,
5862          .NumberofThreadsinGPGPUThreadGroup = 1,
5863          .BTDMode = true,
5864       };
5865 
5866       struct brw_rt_raygen_trampoline_params trampoline_params = {
5867          .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
5868          .raygen_bsr_addr = raygen_sbt->deviceAddress,
5869          .is_indirect = is_indirect,
5870          .local_group_size_log2 = {
5871             local_size_log2[0],
5872             local_size_log2[1],
5873             local_size_log2[2],
5874          },
5875       };
5876       STATIC_ASSERT(sizeof(trampoline_params) == 32);
5877       memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
5878    }
5879 }
5880 
5881 void
5882 genX(CmdTraceRaysKHR)(
5883     VkCommandBuffer                             commandBuffer,
5884     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
5885     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
5886     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
5887     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
5888     uint32_t                                    width,
5889     uint32_t                                    height,
5890     uint32_t                                    depth)
5891 {
5892    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5893 
5894    cmd_buffer_trace_rays(cmd_buffer,
5895                          pRaygenShaderBindingTable,
5896                          pMissShaderBindingTable,
5897                          pHitShaderBindingTable,
5898                          pCallableShaderBindingTable,
5899                          false /* is_indirect */,
5900                          width, height, depth,
5901                          0 /* launch_size_addr */);
5902 }
5903 
5904 void
5905 genX(CmdTraceRaysIndirectKHR)(
5906     VkCommandBuffer                             commandBuffer,
5907     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
5908     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
5909     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
5910     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
5911     VkDeviceAddress                             indirectDeviceAddress)
5912 {
5913    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5914 
5915    cmd_buffer_trace_rays(cmd_buffer,
5916                          pRaygenShaderBindingTable,
5917                          pMissShaderBindingTable,
5918                          pHitShaderBindingTable,
5919                          pCallableShaderBindingTable,
5920                          true /* is_indirect */,
5921                          0, 0, 0, /* width, height, depth, */
5922                          indirectDeviceAddress);
5923 }
5924 #endif /* GFX_VERx10 >= 125 */
5925 
5926 static void
5927 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
5928                             uint32_t pipeline)
5929 {
5930    UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
5931 
5932    if (cmd_buffer->state.current_pipeline == pipeline)
5933       return;
5934 
5935 #if GFX_VER >= 8 && GFX_VER < 10
5936    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
5937     *
5938     *   Software must clear the COLOR_CALC_STATE Valid field in
5939     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
5940     *   with Pipeline Select set to GPGPU.
5941     *
5942     * The internal hardware docs recommend the same workaround for Gfx9
5943     * hardware too.
5944     */
5945    if (pipeline == GPGPU)
5946       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
5947 #endif
5948 
5949 #if GFX_VER == 9
5950    if (pipeline == _3D) {
5951       /* There is a mid-object preemption workaround which requires you to
5952        * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
5953        * even without preemption, we have issues with geometry flickering when
5954        * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
5955        * really know why.
5956        */
5957       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
5958          vfe.MaximumNumberofThreads =
5959             devinfo->max_cs_threads * devinfo->subslice_total - 1;
5960          vfe.NumberofURBEntries     = 2;
5961          vfe.URBEntryAllocationSize = 2;
5962       }
5963 
5964       /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
5965        * invalid. Set the compute pipeline to dirty to force a re-emit of the
5966        * pipeline in case we get back-to-back dispatch calls with the same
5967        * pipeline and a PIPELINE_SELECT in between.
5968        */
5969       cmd_buffer->state.compute.pipeline_dirty = true;
5970    }
5971 #endif
5972 
5973    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
5974     * PIPELINE_SELECT [DevBWR+]":
5975     *
5976     *   Project: DEVSNB+
5977     *
5978     *   Software must ensure all the write caches are flushed through a
5979     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
5980     *   command to invalidate read only caches prior to programming
5981     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
5982     *
5983     * Note the cmd_buffer_apply_pipe_flushes will split this into two
5984     * PIPE_CONTROLs.
5985     */
5986    anv_add_pending_pipe_bits(cmd_buffer,
5987                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5988                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
5989                              ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
5990                              ANV_PIPE_CS_STALL_BIT |
5991                              ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5992                              ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
5993                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
5994                              ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT,
5995                              "flush and invalidate for PIPELINE_SELECT");
5996    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5997 
5998    anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
5999 #if GFX_VER >= 9
6000       ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
6001       ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
6002 #endif
6003       ps.PipelineSelection = pipeline;
6004    }
6005 
6006 #if GFX_VER == 9
6007    if (devinfo->platform == INTEL_PLATFORM_GLK) {
6008       /* Project: DevGLK
6009        *
6010        * "This chicken bit works around a hardware issue with barrier logic
6011        *  encountered when switching between GPGPU and 3D pipelines.  To
6012        *  workaround the issue, this mode bit should be set after a pipeline
6013        *  is selected."
6014        */
6015       anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
6016          scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
6017                                                   : GLK_BARRIER_MODE_3D_HULL;
6018          scec1.GLKBarrierModeMask = 1;
6019       }
6020    }
6021 #endif
6022 
6023    cmd_buffer->state.current_pipeline = pipeline;
6024 }
6025 
6026 void
6027 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
6028 {
6029    genX(flush_pipeline_select)(cmd_buffer, _3D);
6030 }
6031 
6032 void
6033 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
6034 {
6035    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
6036 }
6037 
6038 void
6039 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
6040 {
6041    if (GFX_VER >= 8)
6042       return;
6043 
6044    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
6045     *
6046     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
6047     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
6048     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
6049     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
6050     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
6051     *    Depth Flush Bit set, followed by another pipelined depth stall
6052     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
6053     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
6054     *    via a preceding MI_FLUSH)."
6055     */
6056    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6057       pipe.DepthStallEnable = true;
6058       anv_debug_dump_pc(pipe);
6059    }
6060    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6061       pipe.DepthCacheFlushEnable = true;
6062 #if GFX_VER >= 12
6063       pipe.TileCacheFlushEnable = true;
6064 #endif
6065       anv_debug_dump_pc(pipe);
6066    }
6067    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6068       pipe.DepthStallEnable = true;
6069       anv_debug_dump_pc(pipe);
6070    }
6071 }
6072 
6073 void
6074 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
6075                                      const struct isl_surf *surf)
6076 {
6077 #if GFX_VERx10 == 120
6078    const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;
6079 
6080    switch (cmd_buffer->state.depth_reg_mode) {
6081    case ANV_DEPTH_REG_MODE_HW_DEFAULT:
6082       if (!fmt_is_d16)
6083          return;
6084       break;
6085    case ANV_DEPTH_REG_MODE_D16:
6086       if (fmt_is_d16)
6087          return;
6088       break;
6089    case ANV_DEPTH_REG_MODE_UNKNOWN:
6090       break;
6091    }
6092 
6093    /* We'll change some CHICKEN registers depending on the depth surface
6094     * format. Do a depth flush and stall so the pipeline is not using these
6095     * settings while we change the registers.
6096     */
6097    anv_add_pending_pipe_bits(cmd_buffer,
6098                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
6099                              ANV_PIPE_DEPTH_STALL_BIT |
6100                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
6101                              "Workaround: Stop pipeline for 14010455700");
6102    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6103 
6104    /* Wa_14010455700
6105     *
6106     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6107     * Surface Format is D16_UNORM, surface type is not NULL & 1X_MSAA”.
6108     */
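   /* These are masked registers: each value field has a companion *Mask bit
    * that must be set for the write to take effect, so only the individual
    * chicken bits below are modified.
    */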
6109    anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6110       reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
6111       reg.HIZPlaneOptimizationdisablebitMask = true;
6112    }
6113 
6114    /* Wa_1806527549
6115     *
6116     * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
6117     */
6118    anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
6119       reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
6120       reg.HZDepthTestLEGEOptimizationDisableMask = true;
6121    }
6122 
6123    cmd_buffer->state.depth_reg_mode =
6124       fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
6125 #endif
6126 }
6127 
6128 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
6129  *
6130  *    "The VF cache needs to be invalidated before binding and then using
6131  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
6132  *    (at a 64B granularity) since the last invalidation.  A VF cache
6133  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
6134  *    bit in PIPE_CONTROL."
6135  *
6136  * This is implemented by carefully tracking all vertex and index buffer
6137  * bindings and flushing if the cache ever ends up with a range in the cache
6138  * that would exceed 4 GiB.  This is implemented in three parts:
6139  *
6140  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
6141  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
6142  *       tracking code of the new binding.  If this new binding would cause
6143  *       the cache to have a too-large range on the next draw call, a pipeline
6144  *       stall and VF cache invalidate are added to pending_pipeline_bits.
6145  *
6146  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
6147  *       empty whenever we emit a VF invalidate.
6148  *
6149  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
6150  *       after every 3DPRIMITIVE and copies the bound range into the dirty
6151  *       range for each used buffer.  This has to be a separate step because
6152  *       we don't always re-bind all buffers and so 1. can't know which
6153  *       buffers are actually bound.
6154  */
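
/* Illustrative sketch, not part of the driver: assuming the tracking
 * described above, the core of the check is simply whether the accumulated
 * dirty range still fits in a 32-bit window, since the VF cache only
 * considers the low 32 bits of an address.  The helper name below is
 * hypothetical; the real logic lives in
 * anv_gfx8_9_vb_cache_range_needs_workaround().
 */
static inline bool
example_vb_range_needs_invalidate(const struct anv_vb_cache_range *dirty)
{
   /* An empty range trivially fits. */
   if (dirty->end <= dirty->start)
      return false;

   /* A range larger than 4 GiB can alias previously cached lines. */
   return (dirty->end - dirty->start) > (1ull << 32);
}
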
6155 void
6156 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6157                                                int vb_index,
6158                                                struct anv_address vb_address,
6159                                                uint32_t vb_size)
6160 {
6161    if (GFX_VER < 8 || GFX_VER > 9 ||
6162        anv_use_relocations(cmd_buffer->device->physical))
6163       return;
6164 
6165    struct anv_vb_cache_range *bound, *dirty;
6166    if (vb_index == -1) {
6167       bound = &cmd_buffer->state.gfx.ib_bound_range;
6168       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6169    } else {
6170       assert(vb_index >= 0);
6171       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6172       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6173       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
6174       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
6175    }
6176 
6177    if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
6178                                                   vb_address,
6179                                                   vb_size)) {
6180       anv_add_pending_pipe_bits(cmd_buffer,
6181                                 ANV_PIPE_CS_STALL_BIT |
6182                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
6183                                 "vb > 32b range");
6184    }
6185 }
6186 
6187 void
6188 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6189                                                     uint32_t access_type,
6190                                                     uint64_t vb_used)
6191 {
6192    if (GFX_VER < 8 || GFX_VER > 9 ||
6193        anv_use_relocations(cmd_buffer->device->physical))
6194       return;
6195 
6196    if (access_type == RANDOM) {
6197       /* We have an index buffer */
6198       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
6199       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6200 
6201       if (bound->end > bound->start) {
6202          dirty->start = MIN2(dirty->start, bound->start);
6203          dirty->end = MAX2(dirty->end, bound->end);
6204       }
6205    }
6206 
6207    uint64_t mask = vb_used;
6208    while (mask) {
6209       int i = u_bit_scan64(&mask);
6210       assert(i >= 0);
6211       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6212       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6213 
6214       struct anv_vb_cache_range *bound, *dirty;
6215       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
6216       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
6217 
6218       if (bound->end > bound->start) {
6219          dirty->start = MIN2(dirty->start, bound->start);
6220          dirty->end = MAX2(dirty->end, bound->end);
6221       }
6222    }
6223 }
6224 
6225 /**
6226  * Update the pixel hashing modes that determine the balancing of PS threads
6227  * across subslices and slices.
6228  *
6229  * \param width Width bound of the rendering area (already scaled down if \p
6230  *              scale is greater than 1).
6231  * \param height Height bound of the rendering area (already scaled down if \p
6232  *               scale is greater than 1).
6233  * \param scale The number of framebuffer samples that could potentially be
6234  *              affected by an individual channel of the PS thread.  This is
6235  *              typically one for single-sampled rendering, but for operations
6236  *              like CCS resolves and fast clears a single PS invocation may
6237  *              update a huge number of pixels, in which case a finer
6238  *              balancing is desirable in order to maximally utilize the
6239  *              bandwidth available.  UINT_MAX can be used as shorthand for
6240  *              "finest hashing mode available".
6241  */
6242 void
6243 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
6244                                    unsigned width, unsigned height,
6245                                    unsigned scale)
6246 {
6247 #if GFX_VER == 9
6248    const struct intel_device_info *devinfo = &cmd_buffer->device->info;
6249    const unsigned slice_hashing[] = {
6250       /* Because all Gfx9 platforms with more than one slice require
6251        * three-way subslice hashing, a single "normal" 16x16 slice hashing
6252        * block is guaranteed to suffer from substantial imbalance, with one
6253        * subslice receiving twice as much work as the other two in the
6254        * slice.
6255        *
6256        * The performance impact of that would be particularly severe when
6257        * three-way hashing is also in use for slice balancing (which is the
6258        * case for all Gfx9 GT4 platforms), because one of the slices
6259        * receives one every three 16x16 blocks in either direction, which
6260        * is roughly the periodicity of the underlying subslice imbalance
6261        * pattern ("roughly" because in reality the hardware's
6262        * implementation of three-way hashing doesn't do exact modulo 3
6263        * arithmetic, which somewhat decreases the magnitude of this effect
6264        * in practice).  This leads to a systematic subslice imbalance
6265        * within that slice regardless of the size of the primitive.  The
6266        * 32x32 hashing mode guarantees that the subslice imbalance within a
6267        * single slice hashing block is minimal, largely eliminating this
6268        * effect.
6269        */
6270       _32x32,
6271       /* Finest slice hashing mode available. */
6272       NORMAL
6273    };
6274    const unsigned subslice_hashing[] = {
6275       /* 16x16 would provide a slight cache locality benefit especially
6276        * visible in the sampler L1 cache efficiency of low-bandwidth
6277        * non-LLC platforms, but it comes at the cost of greater subslice
6278        * imbalance for primitives of dimensions approximately intermediate
6279        * between 16x4 and 16x16.
6280        */
6281       _16x4,
6282       /* Finest subslice hashing mode available. */
6283       _8x4
6284    };
6285    /* Dimensions of the smallest hashing block of a given hashing mode.  If
6286     * the rendering area is smaller than this there can't possibly be any
6287     * benefit from switching to this mode, so we optimize out the
6288     * transition.
6289     */
6290    const unsigned min_size[][2] = {
6291          { 16, 4 },
6292          { 8, 4 }
6293    };
6294    const unsigned idx = scale > 1;
6295 
6296    if (cmd_buffer->state.current_hash_scale != scale &&
6297        (width > min_size[idx][0] || height > min_size[idx][1])) {
6298       anv_add_pending_pipe_bits(cmd_buffer,
6299                                 ANV_PIPE_CS_STALL_BIT |
6300                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6301                                 "change pixel hash mode");
6302       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6303 
6304       anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
6305          gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
6306          gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
6307          gt.SubsliceHashing = subslice_hashing[idx];
6308          gt.SubsliceHashingMask = -1;
6309       }
6310 
6311       cmd_buffer->state.current_hash_scale = scale;
6312    }
6313 #endif
6314 }
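
/* Hypothetical usage sketch (the extent values are made up): a fast-clear or
 * CCS-resolve path that updates many pixels per PS invocation could request
 * the finest hashing for the area it touches,
 *
 *    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, width, height, UINT_MAX);
 *
 * while ordinary single-sampled rendering would pass scale == 1 to restore
 * the default coarse hashing.
 */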
6315 
6316 static void
6317 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
6318 {
6319    struct anv_device *device = cmd_buffer->device;
6320    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6321 
6322    /* FIXME: Width and Height are wrong */
6323 
6324    genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
6325 
6326    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6327                                         device->isl_dev.ds.size / 4);
6328    if (dw == NULL)
6329       return;
6330 
6331    struct isl_depth_stencil_hiz_emit_info info = {
6332       .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
6333    };
6334 
6335    if (gfx->depth_att.iview != NULL) {
6336       info.view = &gfx->depth_att.iview->planes[0].isl;
6337    } else if (gfx->stencil_att.iview != NULL) {
6338       info.view = &gfx->stencil_att.iview->planes[0].isl;
6339    }
6340 
6341    if (gfx->depth_att.iview != NULL) {
6342       const struct anv_image_view *iview = gfx->depth_att.iview;
6343       const struct anv_image *image = iview->image;
6344 
6345       info.view = &iview->planes[0].isl;
6346 
6347       const uint32_t depth_plane =
6348          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
6349       const struct anv_surface *depth_surface =
6350          &image->planes[depth_plane].primary_surface;
6351       const struct anv_address depth_address =
6352          anv_image_address(image, &depth_surface->memory_range);
6353 
6354       info.depth_surf = &depth_surface->isl;
6355 
6356       info.depth_address =
6357          anv_batch_emit_reloc(&cmd_buffer->batch,
6358                               dw + device->isl_dev.ds.depth_offset / 4,
6359                               depth_address.bo, depth_address.offset);
6360       info.mocs =
6361          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
6362 
6363       info.hiz_usage = gfx->depth_att.aux_usage;
6364       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
6365          assert(isl_aux_usage_has_hiz(info.hiz_usage));
6366 
6367          const struct anv_surface *hiz_surface =
6368             &image->planes[depth_plane].aux_surface;
6369          const struct anv_address hiz_address =
6370             anv_image_address(image, &hiz_surface->memory_range);
6371 
6372          info.hiz_surf = &hiz_surface->isl;
6373 
6374          info.hiz_address =
6375             anv_batch_emit_reloc(&cmd_buffer->batch,
6376                                  dw + device->isl_dev.ds.hiz_offset / 4,
6377                                  hiz_address.bo, hiz_address.offset);
6378 
6379          info.depth_clear_value = ANV_HZ_FC_VAL;
6380       }
6381    }
6382 
6383    if (gfx->stencil_att.iview != NULL) {
6384       const struct anv_image_view *iview = gfx->stencil_att.iview;
6385       const struct anv_image *image = iview->image;
6386 
6387       if (info.view == NULL)
6388          info.view = &iview->planes[0].isl;
6389 
6390       const uint32_t stencil_plane =
6391          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
6392       const struct anv_surface *stencil_surface =
6393          &image->planes[stencil_plane].primary_surface;
6394       const struct anv_address stencil_address =
6395          anv_image_address(image, &stencil_surface->memory_range);
6396 
6397       info.stencil_surf = &stencil_surface->isl;
6398 
6399       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
6400       info.stencil_address =
6401          anv_batch_emit_reloc(&cmd_buffer->batch,
6402                               dw + device->isl_dev.ds.stencil_offset / 4,
6403                               stencil_address.bo, stencil_address.offset);
6404       info.mocs =
6405          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
6406    }
6407 
6408    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
6409 
6410    if (info.depth_surf)
6411       genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
6412 
6413    if (GFX_VER >= 12) {
6414       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6415       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6416 
6417       /* Wa_1408224581
6418        *
6419     * Workaround: Gfx12LP A-step only. An additional pipe control with
6420     * post-sync = store dword operation is required (the w/a is to
6421     * have an additional pipe control after the stencil state whenever
6422     * the surface state bits of this state change).
6423        *
6424        * This also seems sufficient to handle Wa_14014148106.
6425        */
6426       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6427          pc.PostSyncOperation = WriteImmediateData;
6428          pc.Address = cmd_buffer->device->workaround_address;
6429       }
6430    }
6431    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
6432 }
6433 
6434 static void
6435 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
6436                                    const struct anv_image_view *fsr_iview)
6437 {
6438 #if GFX_VERx10 >= 125
6439    struct anv_device *device = cmd_buffer->device;
6440 
6441    if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
6442       return;
6443 
6444    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6445                                         device->isl_dev.cpb.size / 4);
6446    if (dw == NULL)
6447       return;
6448 
6449    struct isl_cpb_emit_info info = { };
6450 
6451    if (fsr_iview) {
6452       info.view = &fsr_iview->planes[0].isl;
6453       info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
6454       info.address =
6455          anv_batch_emit_reloc(&cmd_buffer->batch,
6456                               dw + device->isl_dev.cpb.offset / 4,
6457                               fsr_iview->image->bindings[0].address.bo,
6458                               fsr_iview->image->bindings[0].address.offset +
6459                               fsr_iview->image->bindings[0].memory_range.offset);
6460       info.mocs =
6461          anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
6462                   ISL_SURF_USAGE_CPB_BIT);
6463    }
6464 
6465    isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
6466 #endif /* GFX_VERx10 >= 125 */
6467 }
6468 
6469 static VkImageLayout
6470 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
6471 {
6472    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
6473       vk_find_struct_const(att->pNext,
6474                            RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
6475    if (layout_info != NULL)
6476       return layout_info->initialLayout;
6477 
6478    return att->imageLayout;
6479 }
6480 
6481 void genX(CmdBeginRendering)(
6482     VkCommandBuffer                             commandBuffer,
6483     const VkRenderingInfo*                      pRenderingInfo)
6484 {
6485    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6486    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6487    VkResult result;
6488 
6489    if (!is_render_queue_cmd_buffer(cmd_buffer)) {
6490       assert(!"Trying to start a render pass on non-render queue!");
6491       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
6492       return;
6493    }
6494 
6495    anv_measure_beginrenderpass(cmd_buffer);
6496    trace_intel_begin_render_pass(&cmd_buffer->trace, cmd_buffer);
6497 
6498    gfx->rendering_flags = pRenderingInfo->flags;
6499    gfx->render_area = pRenderingInfo->renderArea;
6500    gfx->view_mask = pRenderingInfo->viewMask;
6501    gfx->layer_count = pRenderingInfo->layerCount;
6502    gfx->samples = 0;
6503 
6504    const bool is_multiview = gfx->view_mask != 0;
6505    const VkRect2D render_area = gfx->render_area;
6506    const uint32_t layers =
6507       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
6508 
6509    /* The framebuffer size is at least large enough to contain the render
6510     * area.  Because a zero renderArea is possible, we MAX with 1.
6511     */
6512    struct isl_extent3d fb_size = {
6513       .w = MAX2(1, render_area.offset.x + render_area.extent.width),
6514       .h = MAX2(1, render_area.offset.y + render_area.extent.height),
6515       .d = layers,
6516    };
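   /* fb_size is grown below to cover each attachment's extent and is
    * ultimately used to size the NULL surface state once all attachments
    * have been walked.
    */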
6517 
6518    /* Reserve one for the NULL state. */
6519    uint32_t color_att_valid = 0;
6520    uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
6521    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
6522       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
6523          color_att_valid |= BITFIELD_BIT(i);
6524    }
6525    result = anv_cmd_buffer_init_attachments(cmd_buffer,
6526                                             color_att_count,
6527                                             color_att_valid);
6528    if (result != VK_SUCCESS)
6529       return;
6530 
6531    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
6532       if (!(color_att_valid & BITFIELD_BIT(i)))
6533          continue;
6534 
6535       const VkRenderingAttachmentInfo *att =
6536          &pRenderingInfo->pColorAttachments[i];
6537       ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
6538       const VkImageLayout initial_layout = attachment_initial_layout(att);
6539 
6540       assert(render_area.offset.x + render_area.extent.width <=
6541              iview->vk.extent.width);
6542       assert(render_area.offset.y + render_area.extent.height <=
6543              iview->vk.extent.height);
6544       assert(layers <= iview->vk.layer_count);
6545 
6546       fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
6547       fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
6548 
6549       assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
6550       gfx->samples |= iview->vk.image->samples;
6551 
6552       enum isl_aux_usage aux_usage =
6553          anv_layout_to_aux_usage(&cmd_buffer->device->info,
6554                                  iview->image,
6555                                  VK_IMAGE_ASPECT_COLOR_BIT,
6556                                  VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
6557                                  att->imageLayout);
6558 
6559       union isl_color_value fast_clear_color = { .u32 = { 0, } };
6560 
6561       if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6562           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
6563          const union isl_color_value clear_color =
6564             vk_to_isl_color_with_format(att->clearValue.color,
6565                                         iview->planes[0].isl.format);
6566 
6567          /* We only support fast-clears on the first layer */
6568          const bool fast_clear =
6569             (!is_multiview || (gfx->view_mask & 1)) &&
6570             anv_can_fast_clear_color_view(cmd_buffer->device, iview,
6571                                           att->imageLayout, clear_color,
6572                                           layers, render_area);
6573 
6574          if (att->imageLayout != initial_layout) {
6575             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6576                    render_area.extent.width == iview->vk.extent.width &&
6577                    render_area.extent.height == iview->vk.extent.height);
6578             if (is_multiview) {
6579                u_foreach_bit(view, gfx->view_mask) {
6580                   transition_color_buffer(cmd_buffer, iview->image,
6581                                           VK_IMAGE_ASPECT_COLOR_BIT,
6582                                           iview->vk.base_mip_level, 1,
6583                                           iview->vk.base_array_layer + view,
6584                                           1, /* layer_count */
6585                                           initial_layout, att->imageLayout,
6586                                           VK_QUEUE_FAMILY_IGNORED,
6587                                           VK_QUEUE_FAMILY_IGNORED,
6588                                           fast_clear);
6589                }
6590             } else {
6591                transition_color_buffer(cmd_buffer, iview->image,
6592                                        VK_IMAGE_ASPECT_COLOR_BIT,
6593                                        iview->vk.base_mip_level, 1,
6594                                        iview->vk.base_array_layer,
6595                                        gfx->layer_count,
6596                                        initial_layout, att->imageLayout,
6597                                        VK_QUEUE_FAMILY_IGNORED,
6598                                        VK_QUEUE_FAMILY_IGNORED,
6599                                        fast_clear);
6600             }
6601          }
6602 
6603          uint32_t clear_view_mask = pRenderingInfo->viewMask;
6604          uint32_t base_clear_layer = iview->vk.base_array_layer;
6605          uint32_t clear_layer_count = gfx->layer_count;
6606          if (fast_clear) {
6607             /* We only support fast-clears on the first layer */
6608             assert(iview->vk.base_mip_level == 0 &&
6609                    iview->vk.base_array_layer == 0);
6610 
6611             fast_clear_color = clear_color;
6612 
6613             if (iview->image->vk.samples == 1) {
6614                anv_image_ccs_op(cmd_buffer, iview->image,
6615                                 iview->planes[0].isl.format,
6616                                 iview->planes[0].isl.swizzle,
6617                                 VK_IMAGE_ASPECT_COLOR_BIT,
6618                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
6619                                 &fast_clear_color,
6620                                 false);
6621             } else {
6622                anv_image_mcs_op(cmd_buffer, iview->image,
6623                                 iview->planes[0].isl.format,
6624                                 iview->planes[0].isl.swizzle,
6625                                 VK_IMAGE_ASPECT_COLOR_BIT,
6626                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
6627                                 &fast_clear_color,
6628                                 false);
6629             }
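            /* The fast clear above covered view 0 / layer 0 only, so drop
             * them from the set of views/layers handled by the regular
             * clear path below.
             */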
6630             clear_view_mask &= ~1u;
6631             base_clear_layer++;
6632             clear_layer_count--;
6633 
6634             if (isl_color_value_is_zero(clear_color,
6635                                         iview->planes[0].isl.format)) {
6636                /* This image has the auxiliary buffer enabled. We can mark the
6637                 * subresource as not needing a resolve because the clear color
6638                 * will match what's in every RENDER_SURFACE_STATE object when
6639                 * it's being used for sampling.
6640                 */
6641                set_image_fast_clear_state(cmd_buffer, iview->image,
6642                                           VK_IMAGE_ASPECT_COLOR_BIT,
6643                                           ANV_FAST_CLEAR_DEFAULT_VALUE);
6644             } else {
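               /* Unlike the all-zero case above, this clear color may not
                * match what the RENDER_SURFACE_STATE objects contain, so the
                * subresource is marked as potentially needing a resolve.
                */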
6645                set_image_fast_clear_state(cmd_buffer, iview->image,
6646                                           VK_IMAGE_ASPECT_COLOR_BIT,
6647                                           ANV_FAST_CLEAR_ANY);
6648             }
6649          }
6650 
6651          if (is_multiview) {
6652             u_foreach_bit(view, clear_view_mask) {
6653                anv_image_clear_color(cmd_buffer, iview->image,
6654                                      VK_IMAGE_ASPECT_COLOR_BIT,
6655                                      aux_usage,
6656                                      iview->planes[0].isl.format,
6657                                      iview->planes[0].isl.swizzle,
6658                                      iview->vk.base_mip_level,
6659                                      iview->vk.base_array_layer + view, 1,
6660                                      render_area, clear_color);
6661             }
6662          } else {
6663             anv_image_clear_color(cmd_buffer, iview->image,
6664                                   VK_IMAGE_ASPECT_COLOR_BIT,
6665                                   aux_usage,
6666                                   iview->planes[0].isl.format,
6667                                   iview->planes[0].isl.swizzle,
6668                                   iview->vk.base_mip_level,
6669                                   base_clear_layer, clear_layer_count,
6670                                   render_area, clear_color);
6671          }
6672       } else {
6673          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6674          assert(att->imageLayout == initial_layout);
6675       }
6676 
6677       gfx->color_att[i].vk_format = iview->vk.format;
6678       gfx->color_att[i].iview = iview;
6679       gfx->color_att[i].layout = att->imageLayout;
6680       gfx->color_att[i].aux_usage = aux_usage;
6681 
6682       anv_image_fill_surface_state(cmd_buffer->device,
6683                                    iview->image,
6684                                    VK_IMAGE_ASPECT_COLOR_BIT,
6685                                    &iview->planes[0].isl,
6686                                    ISL_SURF_USAGE_RENDER_TARGET_BIT,
6687                                    aux_usage, &fast_clear_color,
6688                                    0, /* anv_image_view_state_flags */
6689                                    &gfx->color_att[i].surface_state,
6690                                    NULL);
6691 
6692       add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
6693 
6694       if (GFX_VER < 10 &&
6695           (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
6696            (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
6697           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
6698           iview->planes[0].isl.base_level == 0 &&
6699           iview->planes[0].isl.base_array_layer == 0) {
6700          genX(copy_fast_clear_dwords)(cmd_buffer,
6701                                       gfx->color_att[i].surface_state.state,
6702                                       iview->image,
6703                                       VK_IMAGE_ASPECT_COLOR_BIT,
6704                                       false /* copy to ss */);
6705       }
6706 
6707       if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
6708          gfx->color_att[i].resolve_mode = att->resolveMode;
6709          gfx->color_att[i].resolve_iview =
6710             anv_image_view_from_handle(att->resolveImageView);
6711          gfx->color_att[i].resolve_layout = att->resolveImageLayout;
6712       }
6713    }
6714 
6715    const struct anv_image_view *fsr_iview = NULL;
6716    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
6717       vk_find_struct_const(pRenderingInfo->pNext,
6718                            RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
6719    if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
6720       fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
6721       /* imageLayout and shadingRateAttachmentTexelSize are ignored */
6722    }
6723 
6724    const struct anv_image_view *ds_iview = NULL;
6725    const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
6726    const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
6727    if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
6728        (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
6729       const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
6730       VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6731       VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6732       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6733       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6734       enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
6735       enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
6736       float depth_clear_value = 0;
6737       uint32_t stencil_clear_value = 0;
6738 
6739       if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
6740          d_iview = anv_image_view_from_handle(d_att->imageView);
6741          initial_depth_layout = attachment_initial_layout(d_att);
6742          depth_layout = d_att->imageLayout;
6743          depth_aux_usage =
6744             anv_layout_to_aux_usage(&cmd_buffer->device->info,
6745                                     d_iview->image,
6746                                     VK_IMAGE_ASPECT_DEPTH_BIT,
6747                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6748                                     depth_layout);
6749          depth_clear_value = d_att->clearValue.depthStencil.depth;
6750       }
6751 
6752       if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
6753          s_iview = anv_image_view_from_handle(s_att->imageView);
6754          initial_stencil_layout = attachment_initial_layout(s_att);
6755          stencil_layout = s_att->imageLayout;
6756          stencil_aux_usage =
6757             anv_layout_to_aux_usage(&cmd_buffer->device->info,
6758                                     s_iview->image,
6759                                     VK_IMAGE_ASPECT_STENCIL_BIT,
6760                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6761                                     stencil_layout);
6762          stencil_clear_value = s_att->clearValue.depthStencil.stencil;
6763       }
6764 
6765       assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
6766       ds_iview = d_iview != NULL ? d_iview : s_iview;
6767       assert(ds_iview != NULL);
6768 
6769       assert(render_area.offset.x + render_area.extent.width <=
6770              ds_iview->vk.extent.width);
6771       assert(render_area.offset.y + render_area.extent.height <=
6772              ds_iview->vk.extent.height);
6773       assert(layers <= ds_iview->vk.layer_count);
6774 
6775       fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
6776       fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
6777 
6778       assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
6779       gfx->samples |= ds_iview->vk.image->samples;
6780 
6781       VkImageAspectFlags clear_aspects = 0;
6782       if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6783           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6784          clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
6785       if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6786           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6787          clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
6788 
6789       if (clear_aspects != 0) {
6790          const bool hiz_clear =
6791             anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
6792                                       depth_layout, clear_aspects,
6793                                       depth_clear_value,
6794                                       render_area);
6795 
6796          if (depth_layout != initial_depth_layout) {
6797             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6798                    render_area.extent.width == d_iview->vk.extent.width &&
6799                    render_area.extent.height == d_iview->vk.extent.height);
6800 
6801             if (is_multiview) {
6802                u_foreach_bit(view, gfx->view_mask) {
6803                   transition_depth_buffer(cmd_buffer, d_iview->image,
6804                                           d_iview->vk.base_array_layer + view,
6805                                           1 /* layer_count */,
6806                                           initial_depth_layout, depth_layout,
6807                                           hiz_clear);
6808                }
6809             } else {
6810                transition_depth_buffer(cmd_buffer, d_iview->image,
6811                                        d_iview->vk.base_array_layer,
6812                                        gfx->layer_count,
6813                                        initial_depth_layout, depth_layout,
6814                                        hiz_clear);
6815             }
6816          }
6817 
6818          if (stencil_layout != initial_stencil_layout) {
6819             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6820                    render_area.extent.width == s_iview->vk.extent.width &&
6821                    render_area.extent.height == s_iview->vk.extent.height);
6822 
6823             if (is_multiview) {
6824                u_foreach_bit(view, gfx->view_mask) {
6825                   transition_stencil_buffer(cmd_buffer, s_iview->image,
6826                                             s_iview->vk.base_mip_level, 1,
6827                                             s_iview->vk.base_array_layer + view,
6828                                             1 /* layer_count */,
6829                                             initial_stencil_layout,
6830                                             stencil_layout,
6831                                             hiz_clear);
6832                }
6833             } else {
6834                transition_stencil_buffer(cmd_buffer, s_iview->image,
6835                                          s_iview->vk.base_mip_level, 1,
6836                                          s_iview->vk.base_array_layer,
6837                                          gfx->layer_count,
6838                                          initial_stencil_layout,
6839                                          stencil_layout,
6840                                          hiz_clear);
6841             }
6842          }
6843 
6844          if (is_multiview) {
6845             uint32_t clear_view_mask = pRenderingInfo->viewMask;
6846             while (clear_view_mask) {
6847                int view = u_bit_scan(&clear_view_mask);
6848 
6849                uint32_t level = ds_iview->vk.base_mip_level;
6850                uint32_t layer = ds_iview->vk.base_array_layer + view;
6851 
6852                if (hiz_clear) {
6853                   anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6854                                       clear_aspects,
6855                                       level, layer, 1,
6856                                       render_area,
6857                                       stencil_clear_value);
6858                } else {
6859                   anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6860                                                 clear_aspects,
6861                                                 depth_aux_usage,
6862                                                 level, layer, 1,
6863                                                 render_area,
6864                                                 depth_clear_value,
6865                                                 stencil_clear_value);
6866                }
6867             }
6868          } else {
6869             uint32_t level = ds_iview->vk.base_mip_level;
6870             uint32_t base_layer = ds_iview->vk.base_array_layer;
6871             uint32_t layer_count = gfx->layer_count;
6872 
6873             if (hiz_clear) {
6874                anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6875                                    clear_aspects,
6876                                    level, base_layer, layer_count,
6877                                    render_area,
6878                                    stencil_clear_value);
6879             } else {
6880                anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6881                                              clear_aspects,
6882                                              depth_aux_usage,
6883                                              level, base_layer, layer_count,
6884                                              render_area,
6885                                              depth_clear_value,
6886                                              stencil_clear_value);
6887             }
6888          }
6889       } else {
6890          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6891          assert(depth_layout == initial_depth_layout);
6892          assert(stencil_layout == initial_stencil_layout);
6893       }
6894 
6895       if (d_iview != NULL) {
6896          gfx->depth_att.vk_format = d_iview->vk.format;
6897          gfx->depth_att.iview = d_iview;
6898          gfx->depth_att.layout = depth_layout;
6899          gfx->depth_att.aux_usage = depth_aux_usage;
6900          if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6901             assert(d_att->resolveImageView != VK_NULL_HANDLE);
6902             gfx->depth_att.resolve_mode = d_att->resolveMode;
6903             gfx->depth_att.resolve_iview =
6904                anv_image_view_from_handle(d_att->resolveImageView);
6905             gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
6906          }
6907       }
6908 
6909       if (s_iview != NULL) {
6910          gfx->stencil_att.vk_format = s_iview->vk.format;
6911          gfx->stencil_att.iview = s_iview;
6912          gfx->stencil_att.layout = stencil_layout;
6913          gfx->stencil_att.aux_usage = stencil_aux_usage;
6914          if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6915             assert(s_att->resolveImageView != VK_NULL_HANDLE);
6916             gfx->stencil_att.resolve_mode = s_att->resolveMode;
6917             gfx->stencil_att.resolve_iview =
6918                anv_image_view_from_handle(s_att->resolveImageView);
6919             gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
6920          }
6921       }
6922    }
6923 
6924    /* Finally, now that we know the right size, set up the null surface */
6925    assert(util_bitcount(gfx->samples) <= 1);
6926    isl_null_fill_state(&cmd_buffer->device->isl_dev,
6927                        gfx->null_surface_state.map,
6928                        .size = fb_size);
6929 
6930    /****** We can now start emitting code to begin the render pass ******/
6931 
6932    gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
6933 
6934    /* Our implementation of VK_KHR_multiview uses instancing to draw the
6935     * different views.  If the client asks for instancing, we need to use the
6936     * Instance Data Step Rate to ensure that we repeat the client's
6937     * per-instance data once for each view.  Since this bit is in
6938     * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
6939     * of each subpass.
6940     */
6941    if (GFX_VER == 7)
6942       gfx->vb_dirty |= ~0;
6943 
6944    /* It is possible to start a render pass with an old pipeline.  Because the
6945     * render pass and subpass index are both baked into the pipeline, this is
6946     * highly unlikely.  In order to do so, it requires that you have a render
6947     * pass with a single subpass and that you use that render pass twice
6948     * back-to-back and use the same pipeline at the start of the second render
6949     * pass as at the end of the first.  In order to avoid unpredictable issues
6950     * with this edge case, we just dirty the pipeline at the start of every
6951     * subpass.
6952     */
6953    gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
6954 
6955 #if GFX_VER >= 11
6956    /* The PIPE_CONTROL command description says:
6957     *
6958     *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
6959     *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
6960     *     Target Cache Flush by enabling this bit. When render target flush
6961     *     is set due to new association of BTI, PS Scoreboard Stall bit must
6962     *     be set in this packet."
6963     */
6964    anv_add_pending_pipe_bits(cmd_buffer,
6965                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
6966                              ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6967                              "change RT");
6968 #endif
6969 
6970    cmd_buffer_emit_depth_stencil(cmd_buffer);
6971 
6972    cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
6973 }
6974 
6975 static void
6976 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
6977                                    struct anv_attachment *att,
6978                                    VkImageAspectFlagBits aspect)
6979 {
6980    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6981    const struct anv_image_view *iview = att->iview;
6982 
6983    if (gfx->view_mask == 0) {
6984       genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6985                                           aspect, att->aux_usage,
6986                                           iview->planes[0].isl.base_level,
6987                                           iview->planes[0].isl.base_array_layer,
6988                                           gfx->layer_count);
6989    } else {
6990       uint32_t res_view_mask = gfx->view_mask;
6991       while (res_view_mask) {
6992          int i = u_bit_scan(&res_view_mask);
6993 
6994          const uint32_t level = iview->planes[0].isl.base_level;
6995          const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
6996 
6997          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6998                                              aspect, att->aux_usage,
6999                                              level, layer, 1);
7000       }
7001    }
7002 }
7003 
7004 static enum blorp_filter
7005 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
7006 {
7007    switch (vk_mode) {
7008    case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
7009       return BLORP_FILTER_SAMPLE_0;
7010    case VK_RESOLVE_MODE_AVERAGE_BIT:
7011       return BLORP_FILTER_AVERAGE;
7012    case VK_RESOLVE_MODE_MIN_BIT:
7013       return BLORP_FILTER_MIN_SAMPLE;
7014    case VK_RESOLVE_MODE_MAX_BIT:
7015       return BLORP_FILTER_MAX_SAMPLE;
7016    default:
7017       return BLORP_FILTER_NONE;
7018    }
7019 }
7020 
7021 static void
7022 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
7023                                    const struct anv_attachment *att,
7024                                    VkImageLayout layout,
7025                                    VkImageAspectFlagBits aspect)
7026 {
7027    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7028    const struct anv_image_view *src_iview = att->iview;
7029    const struct anv_image_view *dst_iview = att->resolve_iview;
7030 
7031    enum isl_aux_usage src_aux_usage =
7032       anv_layout_to_aux_usage(&cmd_buffer->device->info,
7033                               src_iview->image, aspect,
7034                               VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
7035                               layout);
7036 
7037    enum isl_aux_usage dst_aux_usage =
7038       anv_layout_to_aux_usage(&cmd_buffer->device->info,
7039                               dst_iview->image, aspect,
7040                               VK_IMAGE_USAGE_TRANSFER_DST_BIT,
7041                               att->resolve_layout);
7042 
7043    enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
7044 
7045    const VkRect2D render_area = gfx->render_area;
7046    if (gfx->view_mask == 0) {
7047       anv_image_msaa_resolve(cmd_buffer,
7048                              src_iview->image, src_aux_usage,
7049                              src_iview->planes[0].isl.base_level,
7050                              src_iview->planes[0].isl.base_array_layer,
7051                              dst_iview->image, dst_aux_usage,
7052                              dst_iview->planes[0].isl.base_level,
7053                              dst_iview->planes[0].isl.base_array_layer,
7054                              aspect,
7055                              render_area.offset.x, render_area.offset.y,
7056                              render_area.offset.x, render_area.offset.y,
7057                              render_area.extent.width,
7058                              render_area.extent.height,
7059                              gfx->layer_count, filter);
7060    } else {
7061       uint32_t res_view_mask = gfx->view_mask;
7062       while (res_view_mask) {
7063          int i = u_bit_scan(&res_view_mask);
7064 
7065          anv_image_msaa_resolve(cmd_buffer,
7066                                 src_iview->image, src_aux_usage,
7067                                 src_iview->planes[0].isl.base_level,
7068                                 src_iview->planes[0].isl.base_array_layer + i,
7069                                 dst_iview->image, dst_aux_usage,
7070                                 dst_iview->planes[0].isl.base_level,
7071                                 dst_iview->planes[0].isl.base_array_layer + i,
7072                                 aspect,
7073                                 render_area.offset.x, render_area.offset.y,
7074                                 render_area.offset.x, render_area.offset.y,
7075                                 render_area.extent.width,
7076                                 render_area.extent.height,
7077                                 1, filter);
7078       }
7079    }
7080 }
7081 
7082 void genX(CmdEndRendering)(
7083     VkCommandBuffer                             commandBuffer)
7084 {
7085    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7086    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7087 
7088    if (anv_batch_has_error(&cmd_buffer->batch))
7089       return;
7090 
7091    const bool is_multiview = gfx->view_mask != 0;
7092    const uint32_t layers =
7093       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
7094 
7095    bool has_color_resolve = false;
7096    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7097       if (gfx->color_att[i].iview == NULL)
7098          continue;
7099 
7100       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
7101                                          VK_IMAGE_ASPECT_COLOR_BIT);
7102 
7103       /* Stash this off for later */
7104       if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
7105           !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7106          has_color_resolve = true;
7107    }
7108 
7109    if (gfx->depth_att.iview != NULL) {
7110       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
7111                                          VK_IMAGE_ASPECT_DEPTH_BIT);
7112    }
7113 
7114    if (gfx->stencil_att.iview != NULL) {
7115       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
7116                                          VK_IMAGE_ASPECT_STENCIL_BIT);
7117    }
7118 
7119    if (has_color_resolve) {
7120       /* We are about to do some MSAA resolves.  We need to flush so that the
7121        * result of writes to the MSAA color attachments show up in the sampler
7122        * when we blit to the single-sampled resolve target.
7123        */
7124       anv_add_pending_pipe_bits(cmd_buffer,
7125                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7126                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
7127                                 "MSAA resolve");
7128    }
7129 
7130    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
7131        gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
7132       /* We are about to do some MSAA resolves.  We need to flush so that the
7133        * result of writes to the MSAA depth attachments show up in the sampler
7134        * when we blit to the single-sampled resolve target.
7135        */
7136       anv_add_pending_pipe_bits(cmd_buffer,
7137                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7138                                 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
7139                                 "MSAA resolve");
7140    }
7141 
7142    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7143       const struct anv_attachment *att = &gfx->color_att[i];
7144       if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
7145           (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7146          continue;
7147 
7148       cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
7149                                          VK_IMAGE_ASPECT_COLOR_BIT);
7150    }
7151 
7152    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7153        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7154       const struct anv_image_view *src_iview = gfx->depth_att.iview;
7155 
7156       /* MSAA resolves sample from the source attachment.  Transition the
7157        * depth attachment first to get rid of any HiZ that we may not be
7158        * able to handle.
7159        */
7160       transition_depth_buffer(cmd_buffer, src_iview->image,
7161                               src_iview->planes[0].isl.base_array_layer,
7162                               layers,
7163                               gfx->depth_att.layout,
7164                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7165                               false /* will_full_fast_clear */);
7166 
7167       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
7168                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7169                                          VK_IMAGE_ASPECT_DEPTH_BIT);
7170 
7171       /* Transition the source back to the original layout.  This seems a bit
7172        * inefficient but, since HiZ resolves aren't destructive, going from
7173        * less HiZ to more is generally a no-op.
7174        */
7175       transition_depth_buffer(cmd_buffer, src_iview->image,
7176                               src_iview->planes[0].isl.base_array_layer,
7177                               layers,
7178                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7179                               gfx->depth_att.layout,
7180                               false /* will_full_fast_clear */);
7181    }
7182 
7183    if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7184        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7185       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
7186                                          gfx->stencil_att.layout,
7187                                          VK_IMAGE_ASPECT_STENCIL_BIT);
7188    }
7189 
7190 #if GFX_VER == 7
7191    /* On gfx7, we have to store a texturable version of the stencil buffer in
7192     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
7193     * forth at strategic points. Stencil writes are only allowed in the
7194     * following layouts:
7195     *
7196     *  - VK_IMAGE_LAYOUT_GENERAL
7197     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
7198     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
7199     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
7200     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
7201     *
7202     * For general, we have no nice opportunity to transition so we do the copy
7203     * to the shadow unconditionally at the end of the subpass. For transfer
7204     * destinations, we can update it as part of the transfer op. For the other
7205     * layouts, we delay the copy until a transition into some other layout.
7206     */
7207    if (gfx->stencil_att.iview != NULL) {
7208       const struct anv_image_view *iview = gfx->stencil_att.iview;
7209       const struct anv_image *image = iview->image;
7210       const uint32_t plane =
7211          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
7212 
7213       if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
7214           gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL) {
7215          anv_image_copy_to_shadow(cmd_buffer, image,
7216                                   VK_IMAGE_ASPECT_STENCIL_BIT,
7217                                   iview->planes[plane].isl.base_level, 1,
7218                                   iview->planes[plane].isl.base_array_layer,
7219                                   layers);
7220       }
7221    }
7222 #endif
7223 
7224    anv_cmd_buffer_reset_rendering(cmd_buffer);
7225 }
7226 
7227 void
7228 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
7229 {
7230 #if GFX_VERx10 >= 75
7231    struct mi_builder b;
7232    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7233 
7234    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
7235                 mi_reg32(ANV_PREDICATE_RESULT_REG));
7236    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
7237 
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
#endif
}

#if GFX_VERx10 >= 75
void genX(CmdBeginConditionalRenderingEXT)(
   VkCommandBuffer                             commandBuffer,
   const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_address value_address =
      anv_address_add(buffer->address, pConditionalRenderingBegin->offset);

   const bool isInverted = pConditionalRenderingBegin->flags &
                           VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   cmd_state->conditional_render_enabled = true;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   /* Section 19.4 of the Vulkan 1.1.85 spec says:
    *
    *    If the value of the predicate in buffer memory changes
    *    while conditional rendering is active, the rendering commands
    *    may be discarded in an implementation-dependent way.
    *    Some implementations may latch the value of the predicate
    *    upon beginning conditional rendering while others
    *    may read it before every rendering command.
    *
    * So it's perfectly fine to read the value from the buffer once.
    */
   struct mi_value value = mi_mem32(value_address);

   /* Precompute the predicate result; this is necessary to support secondary
    * command buffers, since it is unknown whether conditional rendering is
    * inverted at the time they are recorded.
    */
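   /* mi_ult(0, value) is non-zero exactly when value != 0 (the condition is
    * satisfied), while mi_uge(0, value) is non-zero exactly when value == 0,
    * which gives the inverted behaviour requested by
    * VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT.
    */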
   mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
                isInverted ? mi_uge(&b, mi_imm(0), value) :
                             mi_ult(&b, mi_imm(0), value));
}

void genX(CmdEndConditionalRenderingEXT)(
   VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;

   cmd_state->conditional_render_enabled = false;
}
#endif

/* Set of stage bits that are pipelined, i.e. the corresponding work gets
 * queued up by the command streamer for later execution.
 */
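/* The stages excluded below (top-of-pipe, draw indirect, host and
 * conditional rendering) are handled by the command streamer or the host
 * rather than the 3D pipeline, which is why no stall is emitted for them
 * when signalling or resetting an event.
 */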
#define ANV_PIPELINE_STAGE_PIPELINED_BITS \
   ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | \
     VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR | \
     VK_PIPELINE_STAGE_2_HOST_BIT_KHR | \
     VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)

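/* Signal the event from the GPU: a post-sync PIPE_CONTROL writes
 * VK_EVENT_SET into the event's slot in the dynamic state pool, stalling
 * the command streamer and the pixel scoreboard first if any of the source
 * stages are pipelined.
 */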
void genX(CmdSetEvent2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    const VkDependencyInfoKHR*                  pDependencyInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   VkPipelineStageFlags2KHR src_stages = 0;

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
      src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
      src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
      src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData           = VK_EVENT_SET;
      anv_debug_dump_pc(pc);
   }
}

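/* Reset the event the same way it is set: a post-sync PIPE_CONTROL writes
 * VK_EVENT_RESET into the event's slot, with the same stall if any of the
 * given stages are pipelined.
 */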
void genX(CmdResetEvent2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags2KHR                    stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData           = VK_EVENT_RESET;
      anv_debug_dump_pc(pc);
   }
}

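/* On gfx8+, wait for each event by polling its slot with MI_SEMAPHORE_WAIT
 * until it reads back VK_EVENT_SET, then apply the dependency info as a
 * regular barrier.
 */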
void genX(CmdWaitEvents2KHR)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    eventCount,
    const VkEvent*                              pEvents,
    const VkDependencyInfoKHR*                  pDependencyInfos)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

#if GFX_VER >= 8
   for (uint32_t i = 0; i < eventCount; i++) {
      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
         sem.WaitMode            = PollingMode;
         sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
         sem.SemaphoreDataDword  = VK_EVENT_SET;
         sem.SemaphoreAddress = (struct anv_address) {
            cmd_buffer->device->dynamic_state_pool.block_pool.bo,
            event->state.offset
         };
      }
   }
#else
   anv_finishme("Implement events on gfx7");
#endif

   cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
}

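/* VK_INTEL_performance_query overrides: NULL_HARDWARE disables 3D and media
 * instructions via CS_DEBUG_MODE2 (INSTPM on older generations), while
 * FLUSH_GPU_CACHES flushes and invalidates everything we track so that
 * performance counters are isolated from surrounding work.
 */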
VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
#if GFX_VER >= 9
      anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
         csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
         csdm2.MediaInstructionDisable = pOverrideInfo->enable;
         csdm2._3DRenderingInstructionDisableMask = true;
         csdm2.MediaInstructionDisableMask = true;
      }
#else
      anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
         instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
         instpm.MediaInstructionDisable = pOverrideInfo->enable;
         instpm._3DRenderingInstructionDisableMask = true;
         instpm.MediaInstructionDisableMask = true;
      }
#endif
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_FLUSH_BITS |
                                   ANV_PIPE_INVALIDATE_BITS,
                                   "perf counter isolation");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}

VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Still waiting on which register to write the marker to; it might
    * depend on the generation.
    */

   return VK_SUCCESS;
}

#define TIMESTAMP 0x2358

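/* Write a timestamp either as an end-of-pipe post-sync PIPE_CONTROL
 * operation or, otherwise, by copying the command streamer's TIMESTAMP
 * register (0x2358 above) to memory with the MI builder, which samples the
 * time at the point the command streamer parses the command.
 */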
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                              struct anv_device *device,
                              struct anv_address addr,
                              bool end_of_pipe) {
   if (end_of_pipe) {
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation   = WriteTimestamp;
         pc.Address             = addr;
         anv_debug_dump_pc(pc);
      }
   } else {
      struct mi_builder b;
      mi_builder_init(&b, &device->info, batch);
      mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
   }
}