1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_util.h"
31 #include "util/fast_idiv_by_const.h"
32 
33 #include "common/intel_aux_map.h"
34 #include "common/intel_l3_config.h"
35 #include "genxml/gen_macros.h"
36 #include "genxml/genX_pack.h"
37 #include "genxml/gen_rt_pack.h"
38 
39 #include "nir/nir_xfb_info.h"
40 
41 /* We reserve:
42  *    - GPR 14 for secondary command buffer returns
43  *    - GPR 15 for conditional rendering
44  */
45 #define MI_BUILDER_NUM_ALLOC_GPRS 14
46 #define __gen_get_batch_dwords anv_batch_emit_dwords
47 #define __gen_address_offset anv_address_add
48 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
49 #include "common/mi_builder.h"
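/* Note: the __gen_* macros above must be defined before mi_builder.h is
 * included so that the shared MI builder emits through anv's batch helpers.
 */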
50 
51 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
52                                         uint32_t pipeline);
53 
54 static enum anv_pipe_bits
55 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
56    enum anv_pipe_bits bits = 0;
57    bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
58    bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
59 #if GFX_VER >= 12
60    bits |= (pc->TileCacheFlushEnable) ?  ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
61    bits |= (pc->HDCPipelineFlushEnable) ?  ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
62 #endif
63    bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
64    bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
65    bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
66    bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
67    bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
68    bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
69    bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
70    bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
71    return bits;
72 }
73 
74 #define anv_debug_dump_pc(pc) \
75    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
76       fputs("pc: emit PC=( ", stderr); \
77       anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
78       fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
79    }
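
/* Typical usage, as in the PIPE_CONTROL emissions below (output only appears
 * when the pipe-control debug flag is set in INTEL_DEBUG):
 *
 *    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 *       pc.CommandStreamerStallEnable = true;
 *       anv_debug_dump_pc(pc);
 *    }
 */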
80 
81 static bool
82 is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
83 {
84    struct anv_queue_family *queue_family = cmd_buffer->pool->queue_family;
85    return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
86 }
87 
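/* Re-emits STATE_BASE_ADDRESS for this command buffer, bracketed by the
 * PIPE_CONTROL flushes and invalidations the hardware needs around a base
 * address change (see the comments in the function body).
 */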
88 void
89 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
90 {
91    struct anv_device *device = cmd_buffer->device;
92    UNUSED const struct intel_device_info *devinfo = &device->info;
93    uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
94 
95    /* If we are emitting a new state base address we probably need to re-emit
96     * binding tables.
97     */
98    cmd_buffer->state.descriptors_dirty |= ~0;
99 
100    /* Emit a render target cache flush.
101     *
102     * This isn't documented anywhere in the PRM.  However, it seems to be
103     * necessary prior to changing the surface state base address.  Without
104     * this, we get GPU hangs when using multi-level command buffers which
105     * clear depth, reset state base address, and then go render stuff.
106     */
107    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
108 #if GFX_VER >= 12
109       pc.HDCPipelineFlushEnable = true;
110 #else
111       pc.DCFlushEnable = true;
112 #endif
113       pc.RenderTargetCacheFlushEnable = true;
114       pc.CommandStreamerStallEnable = true;
115 #if GFX_VER == 12
116       /* Wa_1606662791:
117        *
118        *   Software must program PIPE_CONTROL command with "HDC Pipeline
119        *   Flush" prior to programming of the below two non-pipeline state :
120        *      * STATE_BASE_ADDRESS
121        *      * 3DSTATE_BINDING_TABLE_POOL_ALLOC
122        */
123       if (devinfo->revision == 0 /* A0 */)
124          pc.HDCPipelineFlushEnable = true;
125 #endif
126       anv_debug_dump_pc(pc);
127    }
128 
129 #if GFX_VER == 12
130    /* Wa_1607854226:
131     *
132     *  Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
133     *  mode by putting the pipeline temporarily in 3D mode.
134     */
135    uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
136    genX(flush_pipeline_select_3d)(cmd_buffer);
137 #endif
138 
139    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
140       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
141       sba.GeneralStateMOCS = mocs;
142       sba.GeneralStateBaseAddressModifyEnable = true;
143 
144       sba.StatelessDataPortAccessMOCS = mocs;
145 
146       sba.SurfaceStateBaseAddress =
147          anv_cmd_buffer_surface_base_address(cmd_buffer);
148       sba.SurfaceStateMOCS = mocs;
149       sba.SurfaceStateBaseAddressModifyEnable = true;
150 
151       sba.DynamicStateBaseAddress =
152          (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
153       sba.DynamicStateMOCS = mocs;
154       sba.DynamicStateBaseAddressModifyEnable = true;
155 
156       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
157       sba.IndirectObjectMOCS = mocs;
158       sba.IndirectObjectBaseAddressModifyEnable = true;
159 
160       sba.InstructionBaseAddress =
161          (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
162       sba.InstructionMOCS = mocs;
163       sba.InstructionBaseAddressModifyEnable = true;
164 
165 #  if (GFX_VER >= 8)
166       /* Broadwell requires that we specify a buffer size for a bunch of
167        * these fields.  However, since we will be growing the BOs live, we
168        * just set them all to the maximum.
169        */
170       sba.GeneralStateBufferSize       = 0xfffff;
171       sba.IndirectObjectBufferSize     = 0xfffff;
172       if (anv_use_softpin(device->physical)) {
173          /* With softpin, we use fixed addresses so we actually know how big
174           * our base addresses are.
175           */
176          sba.DynamicStateBufferSize    = DYNAMIC_STATE_POOL_SIZE / 4096;
177          sba.InstructionBufferSize     = INSTRUCTION_STATE_POOL_SIZE / 4096;
178       } else {
179          sba.DynamicStateBufferSize    = 0xfffff;
180          sba.InstructionBufferSize     = 0xfffff;
181       }
182       sba.GeneralStateBufferSizeModifyEnable    = true;
183       sba.IndirectObjectBufferSizeModifyEnable  = true;
184       sba.DynamicStateBufferSizeModifyEnable    = true;
185       sba.InstructionBuffersizeModifyEnable     = true;
186 #  else
187       /* On gfx7, we have upper bounds instead.  According to the docs,
188        * setting an upper bound of zero means that no bounds checking is
189        * performed so, in theory, we should be able to leave them zero.
190        * However, border color is broken and the GPU bounds-checks anyway.
191        * To avoid this and other potential problems, we may as well set it
192        * for everything.
193        */
194       sba.GeneralStateAccessUpperBound =
195          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
196       sba.GeneralStateAccessUpperBoundModifyEnable = true;
197       sba.DynamicStateAccessUpperBound =
198          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
199       sba.DynamicStateAccessUpperBoundModifyEnable = true;
200       sba.InstructionAccessUpperBound =
201          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
202       sba.InstructionAccessUpperBoundModifyEnable = true;
203 #  endif
204 #  if (GFX_VER >= 9)
205       if (anv_use_softpin(device->physical)) {
206          sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
207             .bo = device->surface_state_pool.block_pool.bo,
208             .offset = 0,
209          };
210          sba.BindlessSurfaceStateSize = (1 << 20) - 1;
211       } else {
212          sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
213          sba.BindlessSurfaceStateSize = 0;
214       }
215       sba.BindlessSurfaceStateMOCS = mocs;
216       sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
217 #  endif
218 #  if (GFX_VER >= 10)
219       sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
220       sba.BindlessSamplerStateMOCS = mocs;
221       sba.BindlessSamplerStateBaseAddressModifyEnable = true;
222       sba.BindlessSamplerStateBufferSize = 0;
223 #  endif
224    }
225 
226 #if GFX_VER == 12
227    /* Wa_1607854226:
228     *
229     *  Put the pipeline back into its current mode.
230     */
231    if (gfx12_wa_pipeline != UINT32_MAX)
232       genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
233 #endif
234 
235    /* After re-setting the surface state base address, we have to do some
236     * cache flushing so that the sampler engine will pick up the new
237     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
238     * Shared Function > 3D Sampler > State > State Caching (page 96):
239     *
240     *    Coherency with system memory in the state cache, like the texture
241     *    cache is handled partially by software. It is expected that the
242     *    command stream or shader will issue Cache Flush operation or
243     *    Cache_Flush sampler message to ensure that the L1 cache remains
244     *    coherent with system memory.
245     *
246     *    [...]
247     *
248     *    Whenever the value of the Dynamic_State_Base_Addr,
249     *    Surface_State_Base_Addr are altered, the L1 state cache must be
250     *    invalidated to ensure the new surface or sampler state is fetched
251     *    from system memory.
252     *
253     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
254     * which, according to the PIPE_CONTROL instruction documentation in the
255     * Broadwell PRM:
256     *
257     *    Setting this bit is independent of any other bit in this packet.
258     *    This bit controls the invalidation of the L1 and L2 state caches
259     *    at the top of the pipe i.e. at the parsing time.
260     *
261     * Unfortunately, experimentation seems to indicate that state cache
262     * invalidation through a PIPE_CONTROL does nothing whatsoever in
263     * regards to surface state and binding tables.  Instead, it seems that
264     * invalidating the texture cache is what is actually needed.
265     *
266     * XXX: As far as we have been able to determine through
267     * experimentation, flushing the texture cache appears to be
268     * sufficient.  The theory here is that all of the sampling/rendering
269     * units cache the binding table in the texture cache.  However, we have
270     * yet to be able to actually confirm this.
271     */
272    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
273       pc.TextureCacheInvalidationEnable = true;
274       pc.ConstantCacheInvalidationEnable = true;
275       pc.StateCacheInvalidationEnable = true;
276       anv_debug_dump_pc(pc);
277    }
278 }
279 
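/* Records the BO referenced by a surface state so it is resident at submit
 * time; without softpin this also adds the relocation for the surface
 * state's address field.
 */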
280 static void
281 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
282                   struct anv_state state, struct anv_address addr)
283 {
284    VkResult result;
285 
286    if (anv_use_softpin(cmd_buffer->device->physical)) {
287       result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
288                                      &cmd_buffer->pool->alloc,
289                                      addr.bo);
290    } else {
291       const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
292       result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
293                                   &cmd_buffer->pool->alloc,
294                                   state.offset + isl_dev->ss.addr_offset,
295                                   addr.bo, addr.offset, NULL);
296    }
297 
298    if (unlikely(result != VK_SUCCESS))
299       anv_batch_set_error(&cmd_buffer->batch, result);
300 }
301 
302 static void
303 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
304                          struct anv_surface_state state)
305 {
306    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
307 
308    assert(!anv_address_is_null(state.address));
309    add_surface_reloc(cmd_buffer, state.state, state.address);
310 
311    if (!anv_address_is_null(state.aux_address)) {
312       VkResult result =
313          anv_reloc_list_add(&cmd_buffer->surface_relocs,
314                             &cmd_buffer->pool->alloc,
315                             state.state.offset + isl_dev->ss.aux_addr_offset,
316                             state.aux_address.bo,
317                             state.aux_address.offset,
318                             NULL);
319       if (result != VK_SUCCESS)
320          anv_batch_set_error(&cmd_buffer->batch, result);
321    }
322 
323    if (!anv_address_is_null(state.clear_address)) {
324       VkResult result =
325          anv_reloc_list_add(&cmd_buffer->surface_relocs,
326                             &cmd_buffer->pool->alloc,
327                             state.state.offset +
328                             isl_dev->ss.clear_color_state_offset,
329                             state.clear_address.bo,
330                             state.clear_address.offset,
331                             NULL);
332       if (result != VK_SUCCESS)
333          anv_batch_set_error(&cmd_buffer->batch, result);
334    }
335 }
336 
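/* Returns true if packing the clear color through the view's format/swizzle
 * yields different bits than packing it through the surface format, i.e. a
 * resolve of this fast-clear would need a real format conversion.
 */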
337 static bool
338 isl_color_value_requires_conversion(union isl_color_value color,
339                                     const struct isl_surf *surf,
340                                     const struct isl_view *view)
341 {
342    if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
343       return false;
344 
345    uint32_t surf_pack[4] = { 0, 0, 0, 0 };
346    isl_color_value_pack(&color, surf->format, surf_pack);
347 
348    uint32_t view_pack[4] = { 0, 0, 0, 0 };
349    union isl_color_value swiz_color =
350       isl_color_value_swizzle_inv(color, view->swizzle);
351    isl_color_value_pack(&swiz_color, view->format, view_pack);
352 
353    return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
354 }
355 
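/* Decides whether a color attachment clear can be done as a fast clear:
 * the layout must allow it, the clear color must be acceptable for this
 * hardware generation, the render area must cover the whole view, and only
 * the first slice (level 0, layer 0) may be targeted.
 */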
356 static bool
357 anv_can_fast_clear_color_view(struct anv_device * device,
358                               struct anv_image_view *iview,
359                               VkImageLayout layout,
360                               union isl_color_value clear_color,
361                               uint32_t num_layers,
362                               VkRect2D render_area)
363 {
364    if (iview->planes[0].isl.base_array_layer >=
365        anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
366                             iview->planes[0].isl.base_level))
367       return false;
368 
369    /* Start by getting the fast clear type.  We use the first subpass
370     * layout here because we don't want to fast-clear if the first subpass
371     * to use the attachment can't handle fast-clears.
372     */
373    enum anv_fast_clear_type fast_clear_type =
374       anv_layout_to_fast_clear_type(&device->info, iview->image,
375                                     VK_IMAGE_ASPECT_COLOR_BIT,
376                                     layout);
377    switch (fast_clear_type) {
378    case ANV_FAST_CLEAR_NONE:
379       return false;
380    case ANV_FAST_CLEAR_DEFAULT_VALUE:
381       if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
382          return false;
383       break;
384    case ANV_FAST_CLEAR_ANY:
385       break;
386    }
387 
388    /* Potentially, we could do partial fast-clears but doing so has crazy
389     * alignment restrictions.  It's easier to just restrict to full size
390     * fast clears for now.
391     */
392    if (render_area.offset.x != 0 ||
393        render_area.offset.y != 0 ||
394        render_area.extent.width != iview->vk.extent.width ||
395        render_area.extent.height != iview->vk.extent.height)
396       return false;
397 
398    /* On Broadwell and earlier, we can only handle 0/1 clear colors */
399    if (GFX_VER <= 8 &&
400        !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
401       return false;
402 
403    /* If the clear color is one that would require non-trivial format
404     * conversion on resolve, we don't bother with the fast clear.  This
405     * shouldn't be common as most clear colors are 0/1 and the most common
406     * format re-interpretation is for sRGB.
407     */
408    if (isl_color_value_requires_conversion(clear_color,
409                                            &iview->image->planes[0].primary_surface.isl,
410                                            &iview->planes[0].isl)) {
411       anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
412                     "Cannot fast-clear to colors which would require "
413                     "format conversion on resolve");
414       return false;
415    }
416 
417    /* We only allow fast clears to the first slice of an image (level 0,
418     * layer 0) and only for the entire slice.  This guarantees that, at
419     * any given time, there is only one clear color on any given image.
420     * At the time of our testing (Jan 17, 2018), there
421     * were no known applications which would benefit from fast-clearing
422     * more than just the first slice.
423     */
424    if (iview->planes[0].isl.base_level > 0 ||
425        iview->planes[0].isl.base_array_layer > 0) {
426       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
427                     "Rendering with multi-lod or multi-layer framebuffer "
428                     "with LOAD_OP_LOAD and baseMipLevel > 0 or "
429                     "baseArrayLayer > 0.  Not fast clearing.");
430       return false;
431    }
432 
433    if (num_layers > 1) {
434       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
435                     "Rendering to a multi-layer framebuffer with "
436                     "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
437    }
438 
439    return true;
440 }
441 
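/* Decides whether a depth/stencil attachment clear can be done as a HiZ
 * fast clear for the given layout, aspects, clear value and render area.
 */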
442 static bool
443 anv_can_hiz_clear_ds_view(struct anv_device *device,
444                           struct anv_image_view *iview,
445                           VkImageLayout layout,
446                           VkImageAspectFlags clear_aspects,
447                           float depth_clear_value,
448                           VkRect2D render_area)
449 {
450    /* We don't do any HiZ or depth fast-clears on gfx7 yet */
451    if (GFX_VER == 7)
452       return false;
453 
454    /* If we're just clearing stencil, we can always HiZ clear */
455    if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
456       return true;
457 
458    /* We must have depth in order to have HiZ */
459    if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
460       return false;
461 
462    const enum isl_aux_usage clear_aux_usage =
463       anv_layout_to_aux_usage(&device->info, iview->image,
464                               VK_IMAGE_ASPECT_DEPTH_BIT,
465                               VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
466                               layout);
467    if (!blorp_can_hiz_clear_depth(&device->info,
468                                   &iview->image->planes[0].primary_surface.isl,
469                                   clear_aux_usage,
470                                   iview->planes[0].isl.base_level,
471                                   iview->planes[0].isl.base_array_layer,
472                                   render_area.offset.x,
473                                   render_area.offset.y,
474                                   render_area.offset.x +
475                                   render_area.extent.width,
476                                   render_area.offset.y +
477                                   render_area.extent.height))
478       return false;
479 
480    if (depth_clear_value != ANV_HZ_FC_VAL)
481       return false;
482 
483    /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
484     * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
485     * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
486     */
487    if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
488       return false;
489 
490    /* If we got here, then we can fast clear */
491    return true;
492 }
493 
494 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
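/* READ_ONCE forces the compiler to perform a single volatile load of the
 * CPU-mapped AUX-TT entry below; it cannot be elided or re-read.
 */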
495 
496 #if GFX_VER == 12
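/* Updates the gfx12 AUX translation-table entries covering the given
 * miplevels/layers: each 64K-aligned block is pointed at the surface's
 * compression format bits and, if the image uses CCS, marked valid.
 */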
497 static void
498 anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
499                       const struct anv_image *image,
500                       VkImageAspectFlagBits aspect,
501                       uint32_t base_level, uint32_t level_count,
502                       uint32_t base_layer, uint32_t layer_count)
503 {
504    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
505 
506    const struct anv_surface *surface = &image->planes[plane].primary_surface;
507    uint64_t base_address =
508       anv_address_physical(anv_image_address(image, &surface->memory_range));
509 
510    const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
511    uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);
512 
513    /* We're about to live-update the AUX-TT.  We really don't want anyone else
514     * trying to read it while we're doing this.  We could probably get away
515     * with not having this stall in some cases if we were really careful but
516     * it's better to play it safe.  Full stall the GPU.
517     */
518    anv_add_pending_pipe_bits(cmd_buffer,
519                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
520                              "before update AUX-TT");
521    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
522 
523    struct mi_builder b;
524    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
525 
526    for (uint32_t a = 0; a < layer_count; a++) {
527       const uint32_t layer = base_layer + a;
528 
529       uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
530       for (uint32_t l = 0; l < level_count; l++) {
531          const uint32_t level = base_level + l;
532 
533          uint32_t logical_array_layer, logical_z_offset_px;
534          if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
535             logical_array_layer = 0;
536 
537             /* If the given miplevel does not have this layer, then any higher
538              * miplevels won't either because miplevels only get smaller the
539              * higher the LOD.
540              */
541             assert(layer < image->vk.extent.depth);
542             if (layer >= anv_minify(image->vk.extent.depth, level))
543                break;
544             logical_z_offset_px = layer;
545          } else {
546             assert(layer < image->vk.array_layers);
547             logical_array_layer = layer;
548             logical_z_offset_px = 0;
549          }
550 
551          uint64_t slice_start_offset_B, slice_end_offset_B;
552          isl_surf_get_image_range_B_tile(isl_surf, level,
553                                          logical_array_layer,
554                                          logical_z_offset_px,
555                                          &slice_start_offset_B,
556                                          &slice_end_offset_B);
557 
558          start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
559          end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
560       }
561 
562       /* Aux operates 64K at a time */
563       start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
564       end_offset_B = align_u64(end_offset_B, 64 * 1024);
565 
566       for (uint64_t offset = start_offset_B;
567            offset < end_offset_B; offset += 64 * 1024) {
568          uint64_t address = base_address + offset;
569 
570          uint64_t aux_entry_addr64, *aux_entry_map;
571          aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
572                                                  address, &aux_entry_addr64);
573 
574          assert(anv_use_softpin(cmd_buffer->device->physical));
575          struct anv_address aux_entry_address = {
576             .bo = NULL,
577             .offset = aux_entry_addr64,
578          };
579 
580          const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
581          uint64_t new_aux_entry =
582             (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;
583 
584          if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
585             new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;
586 
587          mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
588       }
589    }
590 
591    anv_add_pending_pipe_bits(cmd_buffer,
592                              ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
593                              "after update AUX-TT");
594 }
595 #endif /* GFX_VER == 12 */
596 
597 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
598  * the initial layout is undefined, the HiZ buffer and depth buffer will
599  * represent the same data at the end of this operation.
600  */
601 static void
602 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
603                         const struct anv_image *image,
604                         uint32_t base_layer, uint32_t layer_count,
605                         VkImageLayout initial_layout,
606                         VkImageLayout final_layout,
607                         bool will_full_fast_clear)
608 {
609    const uint32_t depth_plane =
610       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
611    if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
612       return;
613 
614 #if GFX_VER == 12
615    if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
616         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
617        cmd_buffer->device->physical->has_implicit_ccs &&
618        cmd_buffer->device->info.has_aux_map) {
619       anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
620                             0, 1, base_layer, layer_count);
621    }
622 #endif
623 
624    /* If will_full_fast_clear is set, the caller promises to fast-clear the
625     * largest portion of the specified range as it can.  For depth images,
626     * that means the entire image because we don't support multi-LOD HiZ.
627     */
628    assert(image->planes[0].primary_surface.isl.levels == 1);
629    if (will_full_fast_clear)
630       return;
631 
632    const enum isl_aux_state initial_state =
633       anv_layout_to_aux_state(&cmd_buffer->device->info, image,
634                               VK_IMAGE_ASPECT_DEPTH_BIT,
635                               initial_layout);
636    const enum isl_aux_state final_state =
637       anv_layout_to_aux_state(&cmd_buffer->device->info, image,
638                               VK_IMAGE_ASPECT_DEPTH_BIT,
639                               final_layout);
640 
641    const bool initial_depth_valid =
642       isl_aux_state_has_valid_primary(initial_state);
643    const bool initial_hiz_valid =
644       isl_aux_state_has_valid_aux(initial_state);
645    const bool final_needs_depth =
646       isl_aux_state_has_valid_primary(final_state);
647    const bool final_needs_hiz =
648       isl_aux_state_has_valid_aux(final_state);
649 
650    /* Getting into the pass-through state for Depth is tricky and involves
651     * both a resolve and an ambiguate.  We don't handle that state right now
652     * as anv_layout_to_aux_state never returns it.
653     */
654    assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
655 
656    if (final_needs_depth && !initial_depth_valid) {
657       assert(initial_hiz_valid);
658       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
659                        0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
660    } else if (final_needs_hiz && !initial_hiz_valid) {
661       assert(initial_depth_valid);
662       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
663                        0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
664    }
665 }
666 
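/* Returns true for the stencil-attachment-write-optimal layouts; used by the
 * gfx7 shadow-copy logic below to decide when the shadow copy must be
 * refreshed.
 */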
667 static inline bool
668 vk_image_layout_stencil_write_optimal(VkImageLayout layout)
669 {
670    return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
671           layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
672           layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
673 }
674 
675 /* Transitions a stencil buffer from one layout to another.  On gfx7 this
676  * may require copying to the shadow surface used for texturing; on gfx12 it
677  * may require initializing the AUX-TT and clearing the compressed stencil.
678  */
679 static void
680 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
681                           const struct anv_image *image,
682                           uint32_t base_level, uint32_t level_count,
683                           uint32_t base_layer, uint32_t layer_count,
684                           VkImageLayout initial_layout,
685                           VkImageLayout final_layout,
686                           bool will_full_fast_clear)
687 {
688 #if GFX_VER == 7
689    const uint32_t plane =
690       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
691 
692    /* On gfx7, we have to store a texturable version of the stencil buffer in
693     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
694     * forth at strategic points. Stencil writes are only allowed in the
695     * following layouts:
696     *
697     *  - VK_IMAGE_LAYOUT_GENERAL
698     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
699     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
700     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
701     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
702     *
703     * For general, we have no nice opportunity to transition so we do the copy
704     * to the shadow unconditionally at the end of the subpass. For transfer
705     * destinations, we can update it as part of the transfer op. For the other
706     * layouts, we delay the copy until a transition into some other layout.
707     */
708    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
709        vk_image_layout_stencil_write_optimal(initial_layout) &&
710        !vk_image_layout_stencil_write_optimal(final_layout)) {
711       anv_image_copy_to_shadow(cmd_buffer, image,
712                                VK_IMAGE_ASPECT_STENCIL_BIT,
713                                base_level, level_count,
714                                base_layer, layer_count);
715    }
716 #elif GFX_VER == 12
717    const uint32_t plane =
718       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
719    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
720       return;
721 
722    if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
723         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
724        cmd_buffer->device->physical->has_implicit_ccs &&
725        cmd_buffer->device->info.has_aux_map) {
726       anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
727                             base_level, level_count, base_layer, layer_count);
728 
729       /* If will_full_fast_clear is set, the caller promises to fast-clear the
730        * largest portion of the specified range as it can.
731        */
732       if (will_full_fast_clear)
733          return;
734 
735       for (uint32_t l = 0; l < level_count; l++) {
736          const uint32_t level = base_level + l;
737          const VkRect2D clear_rect = {
738             .offset.x = 0,
739             .offset.y = 0,
740             .extent.width = anv_minify(image->vk.extent.width, level),
741             .extent.height = anv_minify(image->vk.extent.height, level),
742          };
743 
744          uint32_t aux_layers =
745             anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
746          uint32_t level_layer_count =
747             MIN2(layer_count, aux_layers - base_layer);
748 
749          /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
750           * Enable:
751           *
752           *    "When enabled, Stencil Buffer needs to be initialized via
753           *    stencil clear (HZ_OP) before any renderpass."
754           */
755          anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
756                              level, base_layer, level_layer_count,
757                              clear_rect, 0 /* Stencil clear value */);
758       }
759    }
760 #endif
761 }
762 
763 #define MI_PREDICATE_SRC0    0x2400
764 #define MI_PREDICATE_SRC1    0x2408
765 #define MI_PREDICATE_RESULT  0x2418
766 
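/* Writes the per-(level, layer) compression-tracking dword for CCS_E images:
 * ~0 means the slice may contain compressed data, 0 means it is known to be
 * resolved.  Other aux usages do not track this state.
 */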
767 static void
768 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
769                          const struct anv_image *image,
770                          VkImageAspectFlagBits aspect,
771                          uint32_t level,
772                          uint32_t base_layer, uint32_t layer_count,
773                          bool compressed)
774 {
775    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
776 
777    /* We only have compression tracking for CCS_E */
778    if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
779       return;
780 
781    for (uint32_t a = 0; a < layer_count; a++) {
782       uint32_t layer = base_layer + a;
783       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
784          sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
785                                                             image, aspect,
786                                                             level, layer);
787          sdi.ImmediateData = compressed ? UINT32_MAX : 0;
788       }
789    }
790 }
791 
792 static void
793 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
794                            const struct anv_image *image,
795                            VkImageAspectFlagBits aspect,
796                            enum anv_fast_clear_type fast_clear)
797 {
798    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
799       sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
800                                                        image, aspect);
801       sdi.ImmediateData = fast_clear;
802    }
803 
804    /* Whenever we have fast-clear, we consider that slice to be compressed.
805     * This makes building predicates much easier.
806     */
807    if (fast_clear != ANV_FAST_CLEAR_NONE)
808       set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
809 }
810 
811 /* This is only really practical on haswell and above because it requires
812  * MI math in order to get it correct.
813  */
814 #if GFX_VERx10 >= 75
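/* In rough pseudocode, a full resolve computes:
 *
 *    pred = compression_state[level][layer];
 *    compression_state[level][layer] = 0;
 *    if (level == 0 && layer == 0)
 *       fast_clear_type &= ~pred;
 *
 * while a partial resolve on the first slice computes:
 *
 *    pred = (fast_clear_supported < fast_clear_type);
 *    fast_clear_type &= ~pred;
 *
 * and MI_PREDICATE is then set up so the following resolve only runs when
 * pred != 0.
 */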
815 static void
816 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
817                                   const struct anv_image *image,
818                                   VkImageAspectFlagBits aspect,
819                                   uint32_t level, uint32_t array_layer,
820                                   enum isl_aux_op resolve_op,
821                                   enum anv_fast_clear_type fast_clear_supported)
822 {
823    struct mi_builder b;
824    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
825 
826    const struct mi_value fast_clear_type =
827       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
828                                                   image, aspect));
829 
830    if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
831       /* In this case, we're doing a full resolve which means we want the
832        * resolve to happen if any compression (including fast-clears) is
833        * present.
834        *
835        * In order to simplify the logic a bit, we make the assumption that,
836        * if the first slice has been fast-cleared, it is also marked as
837        * compressed.  See also set_image_fast_clear_state.
838        */
839       const struct mi_value compression_state =
840          mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
841                                                        image, aspect,
842                                                        level, array_layer));
843       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
844       mi_store(&b, compression_state, mi_imm(0));
845 
846       if (level == 0 && array_layer == 0) {
847          /* If the predicate is true, we want to write 0 to the fast clear type
848           * and, if it's false, leave it alone.  We can do this by writing
849           *
850           * clear_type = clear_type & ~predicate;
851           */
852          struct mi_value new_fast_clear_type =
853             mi_iand(&b, fast_clear_type,
854                         mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
855          mi_store(&b, fast_clear_type, new_fast_clear_type);
856       }
857    } else if (level == 0 && array_layer == 0) {
858       /* In this case, we are doing a partial resolve to get rid of fast-clear
859        * colors.  We don't care about the compression state but we do care
860        * about how much fast clear is allowed by the final layout.
861        */
862       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
863       assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
864 
865       /* We need to compute (fast_clear_supported < image->fast_clear) */
866       struct mi_value pred =
867          mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
868       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
869 
870       /* If the predicate is true, we want to write 0 to the fast clear type
871        * and, if it's false, leave it alone.  We can do this by writing
872        *
873        * clear_type = clear_type & ~predicate;
874        */
875       struct mi_value new_fast_clear_type =
876          mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
877       mi_store(&b, fast_clear_type, new_fast_clear_type);
878    } else {
879       /* In this case, we're trying to do a partial resolve on a slice that
880        * doesn't have clear color.  There's nothing to do.
881        */
882       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
883       return;
884    }
885 
886    /* Set src1 to 0 and use a != condition */
887    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
888 
889    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
890       mip.LoadOperation    = LOAD_LOADINV;
891       mip.CombineOperation = COMBINE_SET;
892       mip.CompareOperation = COMPARE_SRCS_EQUAL;
893    }
894 }
895 #endif /* GFX_VERx10 >= 75 */
896 
897 #if GFX_VER <= 8
898 static void
899 anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
900                                  const struct anv_image *image,
901                                  VkImageAspectFlagBits aspect,
902                                  uint32_t level, uint32_t array_layer,
903                                  enum isl_aux_op resolve_op,
904                                  enum anv_fast_clear_type fast_clear_supported)
905 {
906    struct mi_builder b;
907    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
908 
909    struct mi_value fast_clear_type_mem =
910       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
911                                                       image, aspect));
912 
913    /* This only works for partial resolves and only when the clear color is
914     * all or nothing.  On the upside, this emits less command streamer code
915     * and works on Ivybridge and Bay Trail.
916     */
917    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
918    assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
919 
920    /* We don't support fast clears on anything other than the first slice. */
921    if (level > 0 || array_layer > 0)
922       return;
923 
924    /* On gfx8, we don't have a concept of default clear colors because we
925     * can't sample from CCS surfaces.  It's enough to just load the fast clear
926     * state into the predicate register.
927     */
928    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
929    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
930    mi_store(&b, fast_clear_type_mem, mi_imm(0));
931 
932    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
933       mip.LoadOperation    = LOAD_LOADINV;
934       mip.CombineOperation = COMBINE_SET;
935       mip.CompareOperation = COMPARE_SRCS_EQUAL;
936    }
937 }
938 #endif /* GFX_VER <= 8 */
939 
940 static void
941 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
942                                const struct anv_image *image,
943                                enum isl_format format,
944                                struct isl_swizzle swizzle,
945                                VkImageAspectFlagBits aspect,
946                                uint32_t level, uint32_t array_layer,
947                                enum isl_aux_op resolve_op,
948                                enum anv_fast_clear_type fast_clear_supported)
949 {
950    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
951 
952 #if GFX_VER >= 9
953    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
954                                      aspect, level, array_layer,
955                                      resolve_op, fast_clear_supported);
956 #else /* GFX_VER <= 8 */
957    anv_cmd_simple_resolve_predicate(cmd_buffer, image,
958                                     aspect, level, array_layer,
959                                     resolve_op, fast_clear_supported);
960 #endif
961 
962    /* CCS_D only supports full resolves and BLORP will assert on us if we try
963     * to do a partial resolve on a CCS_D surface.
964     */
965    if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
966        image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
967       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
968 
969    anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
970                     level, array_layer, 1, resolve_op, NULL, true);
971 }
972 
973 static void
974 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
975                                const struct anv_image *image,
976                                enum isl_format format,
977                                struct isl_swizzle swizzle,
978                                VkImageAspectFlagBits aspect,
979                                uint32_t array_layer,
980                                enum isl_aux_op resolve_op,
981                                enum anv_fast_clear_type fast_clear_supported)
982 {
983    assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
984    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
985 
986 #if GFX_VERx10 >= 75
987    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
988                                      aspect, 0, array_layer,
989                                      resolve_op, fast_clear_supported);
990 
991    anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
992                     array_layer, 1, resolve_op, NULL, true);
993 #else
994    unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
995 #endif
996 }
997 
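/* Marks the given subresource range as written (and therefore possibly
 * compressed) so that later resolves know there may be work to do.  Only
 * MCS and CCS_E need this tracking.
 */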
998 void
999 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
1000                                     const struct anv_image *image,
1001                                     VkImageAspectFlagBits aspect,
1002                                     enum isl_aux_usage aux_usage,
1003                                     uint32_t level,
1004                                     uint32_t base_layer,
1005                                     uint32_t layer_count)
1006 {
1007    /* The aspect must be exactly one of the image aspects. */
1008    assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
1009 
1010    /* The only compression types with more than just fast-clears are MCS,
1011     * CCS_E, and HiZ.  With HiZ we just trust the layout and don't actually
1012     * track the current fast-clear and compression state.  This leaves us
1013     * with just MCS and CCS_E.
1014     */
1015    if (aux_usage != ISL_AUX_USAGE_CCS_E &&
1016        aux_usage != ISL_AUX_USAGE_MCS)
1017       return;
1018 
1019    set_image_compressed_bit(cmd_buffer, image, aspect,
1020                             level, base_layer, layer_count, true);
1021 }
1022 
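/* Resets an image's fast-clear tracking to ANV_FAST_CLEAR_NONE and
 * initializes the clear-color dwords to values the hardware accepts
 * (zero on gfx9+, the default channel selects on gfx7.5/8).
 */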
1023 static void
1024 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
1025                       const struct anv_image *image,
1026                       VkImageAspectFlagBits aspect)
1027 {
1028    assert(cmd_buffer && image);
1029    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1030 
1031    set_image_fast_clear_state(cmd_buffer, image, aspect,
1032                               ANV_FAST_CLEAR_NONE);
1033 
1034    /* Initialize the struct fields that are accessed for fast-clears so that
1035     * the HW restrictions on the field values are satisfied.
1036     */
1037    struct anv_address addr =
1038       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1039 
1040    if (GFX_VER >= 9) {
1041       const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1042       const unsigned num_dwords = GFX_VER >= 10 ?
1043                                   isl_dev->ss.clear_color_state_size / 4 :
1044                                   isl_dev->ss.clear_value_size / 4;
1045       for (unsigned i = 0; i < num_dwords; i++) {
1046          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1047             sdi.Address = addr;
1048             sdi.Address.offset += i * 4;
1049             sdi.ImmediateData = 0;
1050          }
1051       }
1052    } else {
1053       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1054          sdi.Address = addr;
1055          if (GFX_VERx10 >= 75) {
1056             /* Pre-SKL, the dword containing the clear values also contains
1057              * other fields, so we need to initialize those fields to match the
1058              * values that would be in a color attachment.
1059              */
1060             sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
1061                                 ISL_CHANNEL_SELECT_GREEN << 22 |
1062                                 ISL_CHANNEL_SELECT_BLUE  << 19 |
1063                                 ISL_CHANNEL_SELECT_ALPHA << 16;
1064          } else if (GFX_VER == 7) {
1065             /* On IVB, the dword containing the clear values also contains
1066              * other fields that must be zero or can be zero.
1067              */
1068             sdi.ImmediateData = 0;
1069          }
1070       }
1071    }
1072 }
1073 
1074 /* Copy the fast-clear value dword(s) between a surface state object and an
1075  * image's fast clear state buffer.
1076  */
1077 static void
1078 genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
1079                              struct anv_state surface_state,
1080                              const struct anv_image *image,
1081                              VkImageAspectFlagBits aspect,
1082                              bool copy_from_surface_state)
1083 {
1084    assert(cmd_buffer && image);
1085    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1086 
1087    struct anv_address ss_clear_addr = {
1088       .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
1089       .offset = surface_state.offset +
1090                 cmd_buffer->device->isl_dev.ss.clear_value_offset,
1091    };
1092    const struct anv_address entry_addr =
1093       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1094    unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
1095 
1096 #if GFX_VER == 7
1097    /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
1098     * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
1099     * in-flight when they are issued even if the memory touched is not
1100     * currently active for rendering.  The weird bit is that it is not the
1101     * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
1102     * rendering hangs such that the next stalling command after the
1103     * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
1104     *
1105     * It is unclear exactly why this hang occurs.  Both MI commands come with
1106     * warnings about the 3D pipeline but that doesn't seem to fully explain
1107     * it.  My (Jason's) best theory is that it has something to do with the
1108     * fact that we're using a GPU state register as our temporary and that
1109     * something with reading/writing it is causing problems.
1110     *
1111     * In order to work around this issue, we emit a PIPE_CONTROL with the
1112     * command streamer stall bit set.
1113     */
1114    anv_add_pending_pipe_bits(cmd_buffer,
1115                              ANV_PIPE_CS_STALL_BIT,
1116                              "after copy_fast_clear_dwords. Avoid potential hang");
1117    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1118 #endif
1119 
1120    struct mi_builder b;
1121    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1122 
1123    if (copy_from_surface_state) {
1124       mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
1125    } else {
1126       mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
1127 
1128       /* Updating a surface state object may require that the state cache be
1129        * invalidated. From the SKL PRM, Shared Functions -> State -> State
1130        * Caching:
1131        *
1132        *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
1133        *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
1134        *    modified [...], the L1 state cache must be invalidated to ensure
1135        *    the new surface or sampler state is fetched from system memory.
1136        *
1137        * In testing, SKL doesn't actually seem to need this, but HSW does.
1138        */
1139       anv_add_pending_pipe_bits(cmd_buffer,
1140                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
1141                                 "after copy_fast_clear_dwords surface state update");
1142    }
1143 }
1144 
1145 /**
1146  * @brief Transitions a color buffer from one layout to another.
1147  *
1148  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
1149  * more information.
1150  *
1151  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
1152  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
1153  *                    this represents the maximum layers to transition at each
1154  *                    specified miplevel.
1155  */
1156 static void
1157 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
1158                         const struct anv_image *image,
1159                         VkImageAspectFlagBits aspect,
1160                         const uint32_t base_level, uint32_t level_count,
1161                         uint32_t base_layer, uint32_t layer_count,
1162                         VkImageLayout initial_layout,
1163                         VkImageLayout final_layout,
1164                         uint64_t src_queue_family,
1165                         uint64_t dst_queue_family,
1166                         bool will_full_fast_clear)
1167 {
1168    struct anv_device *device = cmd_buffer->device;
1169    const struct intel_device_info *devinfo = &device->info;
1170    /* Validate the inputs. */
1171    assert(cmd_buffer);
1172    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1173    /* These values aren't supported for simplicity's sake. */
1174    assert(level_count != VK_REMAINING_MIP_LEVELS &&
1175           layer_count != VK_REMAINING_ARRAY_LAYERS);
1176    /* Ensure the subresource range is valid. */
1177    UNUSED uint64_t last_level_num = base_level + level_count;
1178    const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
1179    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1180    assert((uint64_t)base_layer + layer_count  <= image_layers);
1181    assert(last_level_num <= image->vk.mip_levels);
1182    /* If there is a layout transition, the final layout cannot be undefined or
1183     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1184     */
1185    assert(initial_layout == final_layout ||
1186           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1187            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1188    const struct isl_drm_modifier_info *isl_mod_info =
1189       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1190       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1191       : NULL;
1192 
1193    const bool src_queue_external =
1194       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1195       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1196 
1197    const bool dst_queue_external =
1198       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1199       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1200 
1201    /* Simultaneous acquire and release on external queues is illegal. */
1202    assert(!src_queue_external || !dst_queue_external);
1203 
1204    /* Ownership transition on an external queue requires special action if the
1205     * image has a DRM format modifier because we store image data in
1206     * a driver-private bo which is inaccessible to the external queue.
1207     */
1208    const bool mod_acquire =
1209       src_queue_external &&
1210       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
1211 
1212    const bool mod_release =
1213       dst_queue_external &&
1214       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
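
   /* Illustrative sketch only (nothing here executes it, and the names are
    * placeholders): an acquire that takes the mod_acquire path above would
    * typically come from an app barrier shaped roughly like
    *
    *    VkImageMemoryBarrier barrier = {
    *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
    *       .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
    *       .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    *       .srcQueueFamilyIndex = VK_QUEUE_FAMILY_FOREIGN_EXT,
    *       .dstQueueFamilyIndex = our_queue_family,   // placeholder value
    *       .image = image_with_drm_modifier,          // placeholder handle
    *       .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
    *    };
    *
    * and the matching release simply swaps the two queue family indices.
    */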
1215 
1216    if (initial_layout == final_layout &&
1217        !mod_acquire && !mod_release) {
1218       /* No work is needed. */
1219       return;
1220    }
1221 
1222    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1223 
1224    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
1225        final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
1226       /* This surface is a linear compressed image with a tiled shadow surface
1227        * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
1228        * we need to ensure the shadow copy is up-to-date.
1229        */
1230       assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
1231       assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
1232       assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
1233       assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
1234       assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
1235       assert(plane == 0);
1236       anv_image_copy_to_shadow(cmd_buffer, image,
1237                                VK_IMAGE_ASPECT_COLOR_BIT,
1238                                base_level, level_count,
1239                                base_layer, layer_count);
1240    }
1241 
1242    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1243       return;
1244 
1245    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1246 
1247    /* The following layouts are equivalent for non-linear images. */
1248    const bool initial_layout_undefined =
1249       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1250       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1251 
1252    bool must_init_fast_clear_state = false;
1253    bool must_init_aux_surface = false;
1254 
1255    if (initial_layout_undefined) {
1256       /* The subresource may have been aliased and populated with arbitrary
1257        * data.
1258        */
1259       must_init_fast_clear_state = true;
1260       must_init_aux_surface = true;
1261    } else if (mod_acquire) {
1262       /* The fast clear state lives in a driver-private bo, and therefore the
1263        * external/foreign queue is unaware of it.
1264        *
1265        * If this is the first time we are accessing the image, then the fast
1266        * clear state is uninitialized.
1267        *
1268        * If this is NOT the first time we are accessing the image, then the fast
1269        * clear state may still be valid and correct due to the resolve during
1270        * our most recent ownership release.  However, we do not track the aux
1271        * state with MI stores, and therefore must assume the worst-case: that
1272        * this is the first time we are accessing the image.
1273        */
1274       assert(image->planes[plane].fast_clear_memory_range.binding ==
1275               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1276       must_init_fast_clear_state = true;
1277 
1278       if (image->planes[plane].aux_surface.memory_range.binding ==
1279           ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1280          assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
1281 
1282          /* The aux surface, like the fast clear state, lives in
1283           * a driver-private bo.  We must initialize the aux surface for the
1284           * same reasons we must initialize the fast clear state.
1285           */
1286          must_init_aux_surface = true;
1287       } else {
1288          assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
1289 
1290          /* The aux surface, unlike the fast clear state, lives in
1291           * application-visible VkDeviceMemory and is shared with the
1292           * external/foreign queue. Therefore, when we acquire ownership of the
1293           * image with a defined VkImageLayout, the aux surface is valid and has
1294           * the aux state required by the modifier.
1295           */
1296          must_init_aux_surface = false;
1297       }
1298    }
1299 
1300 #if GFX_VER == 12
1301    /* We do not yet support modifiers with aux on gen12. */
1302    assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
1303 
1304    if (initial_layout_undefined) {
1305       if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
1306          anv_image_init_aux_tt(cmd_buffer, image, aspect,
1307                                base_level, level_count,
1308                                base_layer, layer_count);
1309       }
1310    }
1311 #else
1312    assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
1313 #endif
1314 
1315    if (must_init_fast_clear_state) {
1316       if (base_level == 0 && base_layer == 0)
1317          init_fast_clear_color(cmd_buffer, image, aspect);
1318    }
1319 
1320    if (must_init_aux_surface) {
1321       assert(must_init_fast_clear_state);
1322 
1323       /* Initialize the aux buffers to enable correct rendering.  In order to
1324        * ensure that things such as storage images work correctly, aux buffers
1325        * need to be initialized to valid data.
1326        *
1327        * Having an aux buffer with invalid data is a problem for two reasons:
1328        *
1329        *  1) Having an invalid value in the buffer can confuse the hardware.
1330        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1331        *     invalid and leads to the hardware doing strange things.  It
1332        *     doesn't hang as far as we can tell but rendering corruption can
1333        *     occur.
1334        *
1335        *  2) If this transition is into the GENERAL layout and we then use the
1336        *     image as a storage image, then we must have the aux buffer in the
1337        *     pass-through state so that, if we then go to texture from the
1338        *     image, we get the results of our storage image writes and not the
1339        *     fast clear color or other random data.
1340        *
1341        * For CCS both of the problems above are real demonstrable issues.  In
1342        * that case, the only thing we can do is to perform an ambiguate to
1343        * transition the aux surface into the pass-through state.
1344        *
1345        * For MCS, (2) is never an issue because we don't support multisampled
1346        * storage images.  In theory, issue (1) is a problem with MCS but we've
1347        * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
1348        * theory, be interpreted as something but we don't know that all bit
1349        * patterns are actually valid.  For 2x and 8x, you could easily end up
1350        * with the MCS referring to an invalid plane because not all bits of
1351        * the MCS value are actually used.  Even though we've never seen issues
1352        * in the wild, it's best to play it safe and initialize the MCS.  We
1353        * can use a fast-clear for MCS because we only ever touch from render
1354        * and texture (no image load store).
1355        */
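      /* Summary of the policy implemented below: single-sampled images get a
       * CCS AMBIGUATE, which puts every block into the pass-through state;
       * multisampled images get an MCS FAST_CLEAR, which is sufficient
       * because MCS data is only ever consumed through the render and
       * texture paths.
       */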
1356       if (image->vk.samples == 1) {
1357          for (uint32_t l = 0; l < level_count; l++) {
1358             const uint32_t level = base_level + l;
1359 
1360             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1361             if (base_layer >= aux_layers)
1362                break; /* We will only get fewer layers as level increases */
1363             uint32_t level_layer_count =
1364                MIN2(layer_count, aux_layers - base_layer);
1365 
1366             /* If will_full_fast_clear is set, the caller promises to
1367              * fast-clear the largest portion of the specified range as it can.
1368              * For color images, that means only the first LOD and array slice.
1369              */
1370             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1371                base_layer++;
1372                level_layer_count--;
1373                if (level_layer_count == 0)
1374                   continue;
1375             }
1376 
1377             anv_image_ccs_op(cmd_buffer, image,
1378                              image->planes[plane].primary_surface.isl.format,
1379                              ISL_SWIZZLE_IDENTITY,
1380                              aspect, level, base_layer, level_layer_count,
1381                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1382 
1383             if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
1384                set_image_compressed_bit(cmd_buffer, image, aspect,
1385                                         level, base_layer, level_layer_count,
1386                                         false);
1387             }
1388          }
1389       } else {
1390          if (image->vk.samples == 4 || image->vk.samples == 16) {
1391             anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1392                           "Doing a potentially unnecessary fast-clear to "
1393                           "define an MCS buffer.");
1394          }
1395 
1396          /* If will_full_fast_clear is set, the caller promises to fast-clear
1397           * the largest portion of the specified range as it can.
1398           */
1399          if (will_full_fast_clear)
1400             return;
1401 
1402          assert(base_level == 0 && level_count == 1);
1403          anv_image_mcs_op(cmd_buffer, image,
1404                           image->planes[plane].primary_surface.isl.format,
1405                           ISL_SWIZZLE_IDENTITY,
1406                           aspect, base_layer, layer_count,
1407                           ISL_AUX_OP_FAST_CLEAR, NULL, false);
1408       }
1409       return;
1410    }
1411 
1412    enum isl_aux_usage initial_aux_usage =
1413       anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1414    enum isl_aux_usage final_aux_usage =
1415       anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1416 
1417    /* We must override the anv_layout_to_* functions because they are unaware of
1418     * acquire/release direction.
1419     */
1420    if (mod_acquire) {
1421       initial_aux_usage = isl_mod_info->aux_usage;
1422    } else if (mod_release) {
1423       final_aux_usage = isl_mod_info->aux_usage;
1424    }
1425 
1426    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1427     * We can handle transitions between CCS_D/E to and from NONE.  What we
1428     * don't yet handle is switching between CCS_E and CCS_D within a given
1429     * image.  Doing so in a performant way requires more detailed aux state
1430     * tracking such as what is done in i965.  For now, just assume that we
1431     * only have one type of compression.
1432     */
1433    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1434           final_aux_usage == ISL_AUX_USAGE_NONE ||
1435           initial_aux_usage == final_aux_usage);
1436 
1437    /* If initial aux usage is NONE, there is nothing to resolve */
1438    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1439       return;
1440 
1441    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1442 
1443    /* If the initial layout supports more fast clear than the final layout
1444     * then we need at least a partial resolve.
1445     */
1446    const enum anv_fast_clear_type initial_fast_clear =
1447       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1448    const enum anv_fast_clear_type final_fast_clear =
1449       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1450    if (final_fast_clear < initial_fast_clear)
1451       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1452 
1453    if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
1454        final_aux_usage != ISL_AUX_USAGE_CCS_E)
1455       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
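
   /* Illustrative example (the exact mapping is up to
    * anv_layout_to_fast_clear_type/anv_layout_to_aux_usage): a CCS_E image
    * going from COLOR_ATTACHMENT_OPTIMAL to PRESENT_SRC_KHR typically loses
    * some fast-clear capability and therefore gets a PARTIAL_RESOLVE, while
    * a transition whose final layout maps to ISL_AUX_USAGE_NONE is upgraded
    * to a FULL_RESOLVE so the main surface contains all of the data.
    */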
1456 
1457    if (resolve_op == ISL_AUX_OP_NONE)
1458       return;
1459 
1460    /* Perform a resolve to synchronize data between the main and aux buffer.
1461     * Before we begin, we must satisfy the cache flushing requirement specified
1462     * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1463     *
1464     *    Any transition from any value in {Clear, Render, Resolve} to a
1465     *    different value in {Clear, Render, Resolve} requires end of pipe
1466     *    synchronization.
1467     *
1468     * We perform a flush of the write cache before and after the clear and
1469     * resolve operations to meet this requirement.
1470     *
1471     * Unlike other drawing, fast clear operations are not properly
1472     * synchronized. The first PIPE_CONTROL here likely ensures that the
1473     * contents of the previous render or clear hit the render target before we
1474     * resolve and the second likely ensures that the resolve is complete before
1475     * we do any more rendering or clearing.
1476     */
1477    anv_add_pending_pipe_bits(cmd_buffer,
1478                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1479                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1480                              "after transition RT");
1481 
1482    for (uint32_t l = 0; l < level_count; l++) {
1483       uint32_t level = base_level + l;
1484 
1485       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1486       if (base_layer >= aux_layers)
1487          break; /* We will only get fewer layers as level increases */
1488       uint32_t level_layer_count =
1489          MIN2(layer_count, aux_layers - base_layer);
1490 
1491       for (uint32_t a = 0; a < level_layer_count; a++) {
1492          uint32_t array_layer = base_layer + a;
1493 
1494          /* If will_full_fast_clear is set, the caller promises to fast-clear
1495           * the largest portion of the specified range as it can.  For color
1496           * images, that means only the first LOD and array slice.
1497           */
1498          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1499             continue;
1500 
1501          if (image->vk.samples == 1) {
1502             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1503                                            image->planes[plane].primary_surface.isl.format,
1504                                            ISL_SWIZZLE_IDENTITY,
1505                                            aspect, level, array_layer, resolve_op,
1506                                            final_fast_clear);
1507          } else {
1508             /* We only support fast-clear on the first layer so partial
1509              * resolves should not be used on other layers as they will use
1510              * the clear color stored in memory that is only valid for layer0.
1511              */
1512             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1513                 array_layer != 0)
1514                continue;
1515 
1516             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1517                                            image->planes[plane].primary_surface.isl.format,
1518                                            ISL_SWIZZLE_IDENTITY,
1519                                            aspect, array_layer, resolve_op,
1520                                            final_fast_clear);
1521          }
1522       }
1523    }
1524 
1525    anv_add_pending_pipe_bits(cmd_buffer,
1526                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1527                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1528                              "after transition RT");
1529 }
1530 
1531 static VkResult
1532 genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
1533                                    const struct anv_render_pass *pass,
1534                                    const struct anv_framebuffer *framebuffer,
1535                                    const VkRenderPassBeginInfo *begin)
1536 {
1537    struct anv_cmd_state *state = &cmd_buffer->state;
1538 
1539    vk_free(&cmd_buffer->pool->alloc, state->attachments);
1540 
1541    if (pass->attachment_count > 0) {
1542       state->attachments = vk_zalloc(&cmd_buffer->pool->alloc,
1543                                      pass->attachment_count *
1544                                           sizeof(state->attachments[0]),
1545                                      8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1546       if (state->attachments == NULL) {
1547          /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
1548          return anv_batch_set_error(&cmd_buffer->batch,
1549                                     VK_ERROR_OUT_OF_HOST_MEMORY);
1550       }
1551    } else {
1552       state->attachments = NULL;
1553    }
1554 
1555    const VkRenderPassAttachmentBeginInfoKHR *attach_begin =
1556       vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR);
1557    if (begin && !attach_begin)
1558       assert(pass->attachment_count == framebuffer->attachment_count);
1559 
1560    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
1561       if (attach_begin && attach_begin->attachmentCount != 0) {
1562          assert(attach_begin->attachmentCount == pass->attachment_count);
1563          ANV_FROM_HANDLE(anv_image_view, iview, attach_begin->pAttachments[i]);
1564          state->attachments[i].image_view = iview;
1565       } else if (framebuffer && i < framebuffer->attachment_count) {
1566          state->attachments[i].image_view = framebuffer->attachments[i];
1567       } else {
1568          state->attachments[i].image_view = NULL;
1569       }
1570    }
1571 
1572    if (begin) {
1573       for (uint32_t i = 0; i < pass->attachment_count; ++i) {
1574          const struct anv_render_pass_attachment *pass_att = &pass->attachments[i];
1575          struct anv_attachment_state *att_state = &state->attachments[i];
1576          VkImageAspectFlags att_aspects = vk_format_aspects(pass_att->format);
1577          VkImageAspectFlags clear_aspects = 0;
1578          VkImageAspectFlags load_aspects = 0;
1579 
1580          if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
1581             /* color attachment */
1582             if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1583                clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
1584             } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
1585                load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
1586             }
1587          } else {
1588             /* depthstencil attachment */
1589             if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
1590                if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1591                   clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
1592                } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
1593                   load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
1594                }
1595             }
1596             if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1597                if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1598                   clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
1599                } else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
1600                   load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
1601                }
1602             }
1603          }
1604 
1605          att_state->current_layout = pass_att->initial_layout;
1606          att_state->current_stencil_layout = pass_att->stencil_initial_layout;
1607          att_state->pending_clear_aspects = clear_aspects;
1608          att_state->pending_load_aspects = load_aspects;
1609          if (clear_aspects)
1610             att_state->clear_value = begin->pClearValues[i];
1611 
1612          struct anv_image_view *iview = state->attachments[i].image_view;
1613 
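         /* pending_clear_views is a per-view bitmask, one bit per array
          * layer/view of the attachment.  For example, num_layers == 3
          * yields 0b111, meaning all three views still have a pending clear.
          */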
1614          const uint32_t num_layers = iview->planes[0].isl.array_len;
1615          att_state->pending_clear_views = (1 << num_layers) - 1;
1616 
1617          /* This will be initialized after the first subpass transition. */
1618          att_state->aux_usage = ISL_AUX_USAGE_NONE;
1619 
1620          att_state->fast_clear = false;
1621          if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
1622             assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
1623             att_state->fast_clear =
1624                anv_can_fast_clear_color_view(cmd_buffer->device, iview,
1625                                              pass_att->first_subpass_layout,
1626                                              vk_to_isl_color(att_state->clear_value.color),
1627                                              framebuffer->layers,
1628                                              begin->renderArea);
1629          } else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
1630                                      VK_IMAGE_ASPECT_STENCIL_BIT)) {
1631             att_state->fast_clear =
1632                anv_can_hiz_clear_ds_view(cmd_buffer->device, iview,
1633                                          pass_att->first_subpass_layout,
1634                                          clear_aspects,
1635                                          att_state->clear_value.depthStencil.depth,
1636                                          begin->renderArea);
1637          }
1638       }
1639    }
1640 
1641    return VK_SUCCESS;
1642 }
1643 
1644 /**
1645  * Allocate surface states for a subpass's color and input attachments.
1646  */
1647 static VkResult
1648 genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer,
1649                                        const struct anv_render_pass *pass,
1650                                        const struct anv_subpass *subpass)
1651 {
1652    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1653    struct anv_cmd_state *state = &cmd_buffer->state;
1654 
1655    /* Reserve one for the NULL state. */
1656    unsigned num_states = 1;
1657    for (uint32_t i = 0; i < subpass->attachment_count; i++) {
1658       uint32_t att = subpass->attachments[i].attachment;
1659       if (att == VK_ATTACHMENT_UNUSED)
1660          continue;
1661 
1662       assert(att < pass->attachment_count);
1663       if (!vk_format_is_color(pass->attachments[att].format))
1664          continue;
1665 
1666       const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
1667       assert(util_bitcount(att_usage) == 1);
1668 
1669       if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT ||
1670           att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
1671          num_states++;
1672    }
1673 
1674    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
1675    state->attachment_states =
1676       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1677                              num_states * ss_stride, isl_dev->ss.align);
1678    if (state->attachment_states.map == NULL) {
1679       return anv_batch_set_error(&cmd_buffer->batch,
1680                                  VK_ERROR_OUT_OF_DEVICE_MEMORY);
1681    }
1682 
1683    struct anv_state next_state = state->attachment_states;
1684    next_state.alloc_size = isl_dev->ss.size;
1685 
1686    state->null_surface_state = next_state;
1687    next_state.offset += ss_stride;
1688    next_state.map += ss_stride;
1689 
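   /* Layout of the suballocation walked below: attachment_states is a single
    * contiguous block of surface states,
    *
    *    [ NULL surface | state for att A | state for att B | ... ]
    *
    * with entries ss_stride bytes apart.  next_state is advanced past each
    * entry as it is handed out, which is what the assert at the end of this
    * function verifies.
    */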
1690    for (uint32_t i = 0; i < subpass->attachment_count; i++) {
1691       uint32_t att = subpass->attachments[i].attachment;
1692       if (att == VK_ATTACHMENT_UNUSED)
1693          continue;
1694 
1695       assert(att < pass->attachment_count);
1696       if (!vk_format_is_color(pass->attachments[att].format))
1697          continue;
1698 
1699       const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
1700       assert(util_bitcount(att_usage) == 1);
1701 
1702       if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
1703          state->attachments[att].color.state = next_state;
1704       else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
1705          state->attachments[att].input.state = next_state;
1706       else
1707          continue;
1708 
1710       next_state.offset += ss_stride;
1711       next_state.map += ss_stride;
1712    }
1713 
1714    assert(next_state.offset == state->attachment_states.offset +
1715                                state->attachment_states.alloc_size);
1716 
1717    return VK_SUCCESS;
1718 }
1719 
1720 VkResult
1721 genX(BeginCommandBuffer)(
1722     VkCommandBuffer                             commandBuffer,
1723     const VkCommandBufferBeginInfo*             pBeginInfo)
1724 {
1725    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1726 
1727    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1728     * command buffer's state. Otherwise, we must *reset* its state. In both
1729     * cases we reset it.
1730     *
1731     * From the Vulkan 1.0 spec:
1732     *
1733     *    If a command buffer is in the executable state and the command buffer
1734     *    was allocated from a command pool with the
1735     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1736     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
1737     *    as if vkResetCommandBuffer had been called with
1738     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1739     *    the command buffer in the recording state.
1740     */
1741    anv_cmd_buffer_reset(cmd_buffer);
1742 
1743    cmd_buffer->usage_flags = pBeginInfo->flags;
1744 
1745    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1746     * primary level command buffers.
1747     *
1748     * From the Vulkan 1.0 spec:
1749     *
1750     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1751     *    secondary command buffer is considered to be entirely inside a render
1752     *    pass. If this is a primary command buffer, then this bit is ignored.
1753     */
1754    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1755       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1756 
1757    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1758 
1759    /* We sometimes store vertex data in the dynamic state buffer for blorp
1760     * operations and our dynamic state stream may re-use data from previous
1761     * command buffers.  In order to prevent stale cache data, we flush the VF
1762     * cache.  We could do this on every blorp call but that's not really
1763     * needed as all of the data will get written by the CPU prior to the GPU
1764     * executing anything.  The chances are fairly high that they will use
1765     * blorp at least once per primary command buffer so it shouldn't be
1766     * wasted.
1767     *
1768     * There is also a workaround on gfx8 which requires us to invalidate the
1769     * VF cache occasionally.  It's easier if we can assume we start with a
1770     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1771     */
1772    anv_add_pending_pipe_bits(cmd_buffer,
1773                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1774                              "new cmd buffer");
1775 
1776    /* Re-emit the aux table register in every command buffer.  This way we're
1777     * ensured that we have the table even if this command buffer doesn't
1778     * initialize any images.
1779     */
1780    if (cmd_buffer->device->info.has_aux_map) {
1781       anv_add_pending_pipe_bits(cmd_buffer,
1782                                 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
1783                                 "new cmd buffer with aux-tt");
1784    }
1785 
1786    /* We send an "Indirect State Pointers Disable" packet at
1787     * EndCommandBuffer, so all push constant packets are ignored during a
1788     * context restore. Documentation says after that command, we need to
1789     * emit push constants again before any rendering operation. So we
1790     * flag them dirty here to make sure they get emitted.
1791     */
1792    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1793 
1794    VkResult result = VK_SUCCESS;
1795    if (cmd_buffer->usage_flags &
1796        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1797       assert(pBeginInfo->pInheritanceInfo);
1798       ANV_FROM_HANDLE(anv_render_pass, pass,
1799                       pBeginInfo->pInheritanceInfo->renderPass);
1800       struct anv_subpass *subpass =
1801          &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1802       ANV_FROM_HANDLE(anv_framebuffer, framebuffer,
1803                       pBeginInfo->pInheritanceInfo->framebuffer);
1804 
1805       cmd_buffer->state.pass = pass;
1806       cmd_buffer->state.subpass = subpass;
1807 
1808       /* This is optional in the inheritance info. */
1809       cmd_buffer->state.framebuffer = framebuffer;
1810 
1811       result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
1812                                                   framebuffer, NULL);
1813       if (result != VK_SUCCESS)
1814          return result;
1815 
1816       result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass,
1817                                                       subpass);
1818       if (result != VK_SUCCESS)
1819          return result;
1820 
1821       /* Record that HiZ is enabled if we can. */
1822       if (cmd_buffer->state.framebuffer) {
1823          const struct anv_image_view * const iview =
1824             anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
1825 
1826          if (iview) {
1827             VkImageLayout layout =
1828                 cmd_buffer->state.subpass->depth_stencil_attachment->layout;
1829 
1830             enum isl_aux_usage aux_usage =
1831                anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
1832                                        VK_IMAGE_ASPECT_DEPTH_BIT,
1833                                        VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
1834                                        layout);
1835 
1836             cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage);
1837          }
1838       }
1839 
1840       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1841    }
1842 
1843 #if GFX_VERx10 >= 75
1844    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1845       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1846          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1847 
1848       /* If secondary buffer supports conditional rendering
1849        * we should emit commands as if conditional rendering is enabled.
1850        */
1851       cmd_buffer->state.conditional_render_enabled =
1852          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1853    }
1854 #endif
1855 
1856    return result;
1857 }
1858 
1859 /* From the PRM, Volume 2a:
1860  *
1861  *    "Indirect State Pointers Disable
1862  *
1863  *    At the completion of the post-sync operation associated with this pipe
1864  *    control packet, the indirect state pointers in the hardware are
1865  *    considered invalid; the indirect pointers are not saved in the context.
1866  *    If any new indirect state commands are executed in the command stream
1867  *    while the pipe control is pending, the new indirect state commands are
1868  *    preserved.
1869  *
1870  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1871  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1872  *    commands are only considered as Indirect State Pointers. Once ISP is
1873  *    issued in a context, SW must initialize by programming push constant
1874  *    commands for all the shaders (at least to zero length) before attempting
1875  *    any rendering operation for the same context."
1876  *
1877  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1878  * even though they point to a BO that has been already unreferenced at
1879  * the end of the previous batch buffer. This has been fine so far since
1880  * we are protected by the scratch page (every address not covered by
1881  * a BO should be pointing to the scratch page). But on CNL, it is
1882  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1883  * instruction.
1884  *
1885  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1886  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1887  * context restore, so the mentioned hang doesn't happen. However,
1888  * software must program push constant commands for all stages prior to
1889  * rendering anything. So we flag them dirty in BeginCommandBuffer.
1890  *
1891  * Finally, we also make sure to stall at pixel scoreboard to make sure the
1892  * constants have been loaded into the EUs prior to disabling the push constants
1893  * so that it doesn't hang a previous 3DPRIMITIVE.
1894  */
1895 static void
1896 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1897 {
1898    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1899          pc.StallAtPixelScoreboard = true;
1900          pc.CommandStreamerStallEnable = true;
1901          anv_debug_dump_pc(pc);
1902    }
1903    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1904          pc.IndirectStatePointersDisable = true;
1905          pc.CommandStreamerStallEnable = true;
1906          anv_debug_dump_pc(pc);
1907    }
1908 }
1909 
1910 VkResult
1911 genX(EndCommandBuffer)(
1912     VkCommandBuffer                             commandBuffer)
1913 {
1914    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1915 
1916    if (anv_batch_has_error(&cmd_buffer->batch))
1917       return cmd_buffer->batch.status;
1918 
1919    anv_measure_endcommandbuffer(cmd_buffer);
1920 
1921    /* We want every command buffer to start with the PMA fix in a known state,
1922     * so we disable it at the end of the command buffer.
1923     */
1924    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1925 
1926    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1927 
1928    emit_isp_disable(cmd_buffer);
1929 
1930    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1931 
1932    return VK_SUCCESS;
1933 }
1934 
1935 void
1936 genX(CmdExecuteCommands)(
1937     VkCommandBuffer                             commandBuffer,
1938     uint32_t                                    commandBufferCount,
1939     const VkCommandBuffer*                      pCmdBuffers)
1940 {
1941    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1942 
1943    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1944 
1945    if (anv_batch_has_error(&primary->batch))
1946       return;
1947 
1948    /* The secondary command buffers will assume that the PMA fix is disabled
1949     * when they begin executing.  Make sure this is true.
1950     */
1951    genX(cmd_buffer_enable_pma_fix)(primary, false);
1952 
1953    /* The secondary command buffer doesn't know which textures etc. have been
1954     * flushed prior to their execution.  Apply those flushes now.
1955     */
1956    genX(cmd_buffer_apply_pipe_flushes)(primary);
1957 
1958    for (uint32_t i = 0; i < commandBufferCount; i++) {
1959       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1960 
1961       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1962       assert(!anv_batch_has_error(&secondary->batch));
1963 
1964 #if GFX_VERx10 >= 75
1965       if (secondary->state.conditional_render_enabled) {
1966          if (!primary->state.conditional_render_enabled) {
1967             /* Secondary buffer is constructed as if it will be executed
1968              * with conditional rendering, we should satisfy this dependency
1969              * regardless of conditional rendering being enabled in primary.
1970              */
1971             struct mi_builder b;
1972             mi_builder_init(&b, &primary->device->info, &primary->batch);
1973             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1974                          mi_imm(UINT64_MAX));
1975          }
1976       }
1977 #endif
1978 
1979       if (secondary->usage_flags &
1980           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1981          /* If we're continuing a render pass from the primary, we need to
1982           * copy the surface states for the current subpass into the storage
1983           * we allocated for them in BeginCommandBuffer.
1984           */
1985          struct anv_bo *ss_bo =
1986             primary->device->surface_state_pool.block_pool.bo;
1987          struct anv_state src_state = primary->state.attachment_states;
1988          struct anv_state dst_state = secondary->state.attachment_states;
1989          assert(src_state.alloc_size == dst_state.alloc_size);
1990 
1991          genX(cmd_buffer_so_memcpy)(primary,
1992                                     (struct anv_address) {
1993                                        .bo = ss_bo,
1994                                        .offset = dst_state.offset,
1995                                     },
1996                                     (struct anv_address) {
1997                                        .bo = ss_bo,
1998                                        .offset = src_state.offset,
1999                                     },
2000                                     src_state.alloc_size);
2001       }
2002 
2003       anv_cmd_buffer_add_secondary(primary, secondary);
2004 
2005       assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
2006              secondary->perf_query_pool == primary->perf_query_pool);
2007       if (secondary->perf_query_pool)
2008          primary->perf_query_pool = secondary->perf_query_pool;
2009 
2010 #if GFX_VERx10 == 120
2011       if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
2012          primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
2013 #endif
2014    }
2015 
2016    /* The secondary isn't counted in our VF cache tracking so we need to
2017     * invalidate the whole thing.
2018     */
2019    if (GFX_VER >= 8 && GFX_VER <= 9) {
2020       anv_add_pending_pipe_bits(primary,
2021                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
2022                                 "Secondary cmd buffer not tracked in VF cache");
2023    }
2024 
2025    /* The secondary may have selected a different pipeline (3D or compute) and
2026     * may have changed the current L3$ configuration.  Reset our tracking
2027     * variables to invalid values to ensure that we re-emit these in the case
2028     * where we do any draws or compute dispatches from the primary after the
2029     * secondary has returned.
2030     */
2031    primary->state.current_pipeline = UINT32_MAX;
2032    primary->state.current_l3_config = NULL;
2033    primary->state.current_hash_scale = 0;
2034 
2035    /* Each of the secondary command buffers will use its own state base
2036     * address.  We need to re-emit state base address for the primary after
2037     * all of the secondaries are done.
2038     *
2039     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
2040     * address calls?
2041     */
2042    genX(cmd_buffer_emit_state_base_address)(primary);
2043 }
2044 
2045 /**
2046  * Program the hardware to use the specified L3 configuration.
2047  */
2048 void
2049 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
2050                            const struct intel_l3_config *cfg)
2051 {
2052    assert(cfg || GFX_VER >= 12);
2053    if (cfg == cmd_buffer->state.current_l3_config)
2054       return;
2055 
2056 #if GFX_VER >= 11
2057    /* On Gfx11+ we use only one config, so verify it remains the same and skip
2058     * the stalling programming entirely.
2059     */
2060    assert(cfg == cmd_buffer->device->l3_config);
2061 #else
2062    if (INTEL_DEBUG(DEBUG_L3)) {
2063       mesa_logd("L3 config transition: ");
2064       intel_dump_l3_config(cfg, stderr);
2065    }
2066 
2067    /* According to the hardware docs, the L3 partitioning can only be changed
2068     * while the pipeline is completely drained and the caches are flushed,
2069     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
2070     */
2071    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2072       pc.DCFlushEnable = true;
2073       pc.PostSyncOperation = NoWrite;
2074       pc.CommandStreamerStallEnable = true;
2075       anv_debug_dump_pc(pc);
2076    }
2077 
2078    /* ...followed by a second pipelined PIPE_CONTROL that initiates
2079     * invalidation of the relevant caches.  Note that because RO invalidation
2080     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
2081     * command is processed by the CS) we cannot combine it with the previous
2082     * stalling flush as the hardware documentation suggests, because that
2083     * would cause the CS to stall on previous rendering *after* RO
2084     * invalidation and wouldn't prevent the RO caches from being polluted by
2085     * concurrent rendering before the stall completes.  This intentionally
2086     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
2087     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
2088     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
2089     * already guarantee that there is no concurrent GPGPU kernel execution
2090     * (see SKL HSD 2132585).
2091     */
2092    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2093       pc.TextureCacheInvalidationEnable = true;
2094       pc.ConstantCacheInvalidationEnable = true;
2095       pc.InstructionCacheInvalidateEnable = true;
2096       pc.StateCacheInvalidationEnable = true;
2097       pc.PostSyncOperation = NoWrite;
2098       anv_debug_dump_pc(pc);
2099    }
2100 
2101    /* Now send a third stalling flush to make sure that invalidation is
2102     * complete when the L3 configuration registers are modified.
2103     */
2104    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2105       pc.DCFlushEnable = true;
2106       pc.PostSyncOperation = NoWrite;
2107       pc.CommandStreamerStallEnable = true;
2108       anv_debug_dump_pc(pc);
2109    }
2110 
2111    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
2112 #endif /* GFX_VER >= 11 */
2113    cmd_buffer->state.current_l3_config = cfg;
2114 }
2115 
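/* Typical usage elsewhere in this file (illustrative): callers queue up bits
 * and then apply them, e.g.
 *
 *    anv_add_pending_pipe_bits(cmd_buffer,
 *                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
 *                              "reason for the flush");
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 *
 * so that several requests can be coalesced into as few PIPE_CONTROLs as
 * possible.
 */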
2116 void
2117 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
2118 {
2119    UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
2120    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
2121 
2122    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
2123       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
2124    else if (bits == 0)
2125       return;
2126 
2127    /*
2128     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
2129     *
2130     *    Write synchronization is a special case of end-of-pipe
2131     *    synchronization that requires that the render cache and/or depth
2132     *    related caches are flushed to memory, where the data will become
2133     *    globally visible. This type of synchronization is required prior to
2134     *    SW (CPU) actually reading the result data from memory, or initiating
2135     *    an operation that will use as a read surface (such as a texture
2136     *    surface) a previous render target and/or depth/stencil buffer
2137     *
2138     *
2139     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2140     *
2141     *    Exercising the write cache flush bits (Render Target Cache Flush
2142     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
2143     *    ensures the write caches are flushed and doesn't guarantee the data
2144     *    is globally visible.
2145     *
2146     *    SW can track the completion of the end-of-pipe-synchronization by
2147     *    using "Notify Enable" and "PostSync Operation - Write Immediate
2148     *    Data" in the PIPE_CONTROL command.
2149     *
2150     * In other words, flushes are pipelined while invalidations are handled
2151     * immediately.  Therefore, if we're flushing anything then we need to
2152     * schedule an end-of-pipe sync before any invalidations can happen.
2153     */
2154    if (bits & ANV_PIPE_FLUSH_BITS)
2155       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
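
   /* Worked example of the rule above: if a caller queued
    *
    *    ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
    *    ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT
    *
    * the flush bit forces NEEDS_END_OF_PIPE_SYNC here, the invalidate check
    * below turns that into an actual END_OF_PIPE_SYNC, and we end up emitting
    * a flushing/stalling PIPE_CONTROL first and the invalidating PIPE_CONTROL
    * second, matching the "flushes are pipelined, invalidations are
    * immediate" model quoted above.
    */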
2156 
2157 
2158    /* HSD 1209978178: docs say that before programming the aux table:
2159     *
2160     *    "Driver must ensure that the engine is IDLE but ensure it doesn't
2161     *    add extra flushes in the case it knows that the engine is already
2162     *    IDLE."
2163     */
2164    if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
2165       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2166 
2167    /* If we're going to do an invalidate and we have a pending end-of-pipe
2168     * sync that has yet to be resolved, we do the end-of-pipe sync now.
2169     */
2170    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
2171        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
2172       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
2173       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2174    }
2175 
2176    /* Wa_1409226450, Wait for EU to be idle before pipe control which
2177     * invalidates the instruction cache
2178     */
2179    if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
2180       bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2181 
2182    if ((GFX_VER >= 8 && GFX_VER <= 9) &&
2183        (bits & ANV_PIPE_CS_STALL_BIT) &&
2184        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
2185       /* If we are doing a VF cache invalidate AND a CS stall (it must be
2186        * both) then we can reset our vertex cache tracking.
2187        */
2188       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
2189              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
2190       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
2191              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
2192    }
2193 
2194    /* Project: SKL / Argument: LRI Post Sync Operation [23]
2195     *
2196     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
2197     *  programmed prior to programming a PIPECONTROL command with "LRI
2198     *  Post Sync Operation" in GPGPU mode of operation (i.e when
2199     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
2200     *
2201     * The same text exists a few rows below for Post Sync Op.
2202     *
2203     * On Gfx12 this is Wa_1607156449.
2204     */
2205    if (bits & ANV_PIPE_POST_SYNC_BIT) {
2206       if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0 */)) &&
2207           cmd_buffer->state.current_pipeline == GPGPU)
2208          bits |= ANV_PIPE_CS_STALL_BIT;
2209       bits &= ~ANV_PIPE_POST_SYNC_BIT;
2210    }
2211 
2212    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2213                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
2214       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2215 #if GFX_VER >= 12
2216          pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2217          pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2218 #else
2219          /* Flushing HDC pipeline requires DC Flush on earlier HW. */
2220          pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2221 #endif
2222          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2223          pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2224          pipe.RenderTargetCacheFlushEnable =
2225             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2226 
2227          /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2228           * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2229           */
2230 #if GFX_VER >= 12
2231          pipe.DepthStallEnable =
2232             pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
2233 #else
2234          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2235 #endif
2236 
2237          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2238          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2239 
2240          /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
2241           *
2242           *    "The most common action to perform upon reaching a
2243           *    synchronization point is to write a value out to memory. An
2244           *    immediate value (included with the synchronization command) may
2245           *    be written."
2246           *
2247           *
2248           * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
2249           *
2250           *    "In case the data flushed out by the render engine is to be
2251           *    read back in to the render engine in coherent manner, then the
2252           *    render engine has to wait for the fence completion before
2253           *    accessing the flushed data. This can be achieved by following
2254           *    means on various products: PIPE_CONTROL command with CS Stall
2255           *    and the required write caches flushed with Post-Sync-Operation
2256           *    as Write Immediate Data.
2257           *
2258           *    Example:
2259           *       - Workload-1 (3D/GPGPU/MEDIA)
2260           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2261           *         Immediate Data, Required Write Cache Flush bits set)
2262           *       - Workload-2 (Can use the data produce or output by
2263           *         Workload-1)
2264           */
2265          if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
2266             pipe.CommandStreamerStallEnable = true;
2267             pipe.PostSyncOperation = WriteImmediateData;
2268             pipe.Address = cmd_buffer->device->workaround_address;
2269          }
2270 
2271          /*
2272           * According to the Broadwell documentation, any PIPE_CONTROL with the
2273           * "Command Streamer Stall" bit set must also have another bit set,
2274           * with one of the following options:
2275           *
2276           *  - Render Target Cache Flush
2277           *  - Depth Cache Flush
2278           *  - Stall at Pixel Scoreboard
2279           *  - Post-Sync Operation
2280           *  - Depth Stall
2281           *  - DC Flush Enable
2282           *
2283           * I chose "Stall at Pixel Scoreboard" since that's what we use in
2284           * mesa and it seems to work fine. The choice is fairly arbitrary.
2285           */
2286          if (pipe.CommandStreamerStallEnable &&
2287              !pipe.RenderTargetCacheFlushEnable &&
2288              !pipe.DepthCacheFlushEnable &&
2289              !pipe.StallAtPixelScoreboard &&
2290              !pipe.PostSyncOperation &&
2291              !pipe.DepthStallEnable &&
2292              !pipe.DCFlushEnable)
2293             pipe.StallAtPixelScoreboard = true;
2294          anv_debug_dump_pc(pipe);
2295       }
2296 
2297       /* If a render target flush was emitted, then we can toggle off the bit
2298        * saying that render target writes are ongoing.
2299        */
2300       if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
2301          bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
2302 
2303       if (GFX_VERx10 == 75) {
2304          /* Haswell needs additional work-arounds:
2305           *
2306           * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2307           *
2308           *    Option 1:
2309           *    PIPE_CONTROL command with the CS Stall and the required write
2310           *    caches flushed with Post-SyncOperation as Write Immediate Data
2311           *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
2312           *    space) commands.
2313           *
2314           *    Example:
2315           *       - Workload-1
2316           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2317           *         Immediate Data, Required Write Cache Flush bits set)
2318           *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
2319           *       - Workload-2 (Can use the data produce or output by
2320           *         Workload-1)
2321           *
2322           * Unfortunately, both the PRMs and the internal docs are a bit
2323           * out-of-date in this regard.  What the windows driver does (and
2324           * this appears to actually work) is to emit a register read from the
2325           * memory address written by the pipe control above.
2326           *
2327           * What register we load into doesn't matter.  We choose an indirect
2328           * rendering register because we know it always exists and it's one
2329           * of the first registers the command parser allows us to write.  If
2330           * you don't have command parser support in your kernel (pre-4.2),
2331           * this will get turned into MI_NOOP and you won't get the
2332           * workaround.  Unfortunately, there's just not much we can do in
2333           * that case.  This register is perfectly safe to write since we
2334           * always re-load all of the indirect draw registers right before
2335           * 3DPRIMITIVE when needed anyway.
2336           */
2337          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2338             lrm.RegisterAddress  = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
2339             lrm.MemoryAddress = cmd_buffer->device->workaround_address;
2340          }
2341       }
2342 
2343       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2344                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2345    }
2346 
2347    if (bits & ANV_PIPE_INVALIDATE_BITS) {
2348       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2349        *
2350        *    "If the VF Cache Invalidation Enable is set to a 1 in a
2351        *    PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2352        *    0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2353        *    prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2354        *    a 1."
2355        *
2356        * This appears to hang Broadwell, so we restrict it to just gfx9.
2357        */
2358       if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2359          anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe);
2360 
2361       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2362          pipe.StateCacheInvalidationEnable =
2363             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2364          pipe.ConstantCacheInvalidationEnable =
2365             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2366          pipe.VFCacheInvalidationEnable =
2367             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2368          pipe.TextureCacheInvalidationEnable =
2369             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2370          pipe.InstructionCacheInvalidateEnable =
2371             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2372 
2373          /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2374           *
2375           *    "When VF Cache Invalidate is set “Post Sync Operation” must be
2376           *    enabled to “Write Immediate Data” or “Write PS Depth Count” or
2377           *    “Write Timestamp”.
2378           */
2379          if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
2380             pipe.PostSyncOperation = WriteImmediateData;
2381             pipe.Address = cmd_buffer->device->workaround_address;
2382          }
2383          anv_debug_dump_pc(pipe);
2384       }
2385 
2386 #if GFX_VER == 12
2387       if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&
2388           cmd_buffer->device->info.has_aux_map) {
2389          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2390             lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
2391             lri.DataDWord = 1;
2392          }
2393       }
2394 #endif
2395 
2396       bits &= ~ANV_PIPE_INVALIDATE_BITS;
2397    }
2398 
2399    cmd_buffer->state.pending_pipe_bits = bits;
2400 }
2401 
2402 static void
2403 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
2404                    const VkDependencyInfoKHR *dep_info,
2405                    const char *reason)
2406 {
2407    /* XXX: Right now, we're really dumb and just flush whatever categories
2408     * the app asks for.  One of these days we may make this a bit better
2409     * but right now that's all the hardware allows for in most areas.
2410     */
2411    VkAccessFlags2KHR src_flags = 0;
2412    VkAccessFlags2KHR dst_flags = 0;
2413 
2414    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
2415       src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
2416       dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
2417    }
2418 
2419    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
2420       src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
2421       dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
2422    }
2423 
2424    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
2425       const VkImageMemoryBarrier2KHR *img_barrier =
2426          &dep_info->pImageMemoryBarriers[i];
2427 
2428       src_flags |= img_barrier->srcAccessMask;
2429       dst_flags |= img_barrier->dstAccessMask;
2430 
2431       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
2432       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
2433 
2434       uint32_t base_layer, layer_count;
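      /* For 3D images we treat the depth slices of the base mip level as the
       * "layers" to transition, since 3D images have no array layers.
       */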
2435       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
2436          base_layer = 0;
2437          layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
2438       } else {
2439          base_layer = range->baseArrayLayer;
2440          layer_count = vk_image_subresource_layer_count(&image->vk, range);
2441       }
2442       const uint32_t level_count =
2443          vk_image_subresource_level_count(&image->vk, range);
2444 
2445       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2446          transition_depth_buffer(cmd_buffer, image,
2447                                  base_layer, layer_count,
2448                                  img_barrier->oldLayout,
2449                                  img_barrier->newLayout,
2450                                  false /* will_full_fast_clear */);
2451       }
2452 
2453       if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2454          transition_stencil_buffer(cmd_buffer, image,
2455                                    range->baseMipLevel, level_count,
2456                                    base_layer, layer_count,
2457                                    img_barrier->oldLayout,
2458                                    img_barrier->newLayout,
2459                                    false /* will_full_fast_clear */);
2460       }
2461 
2462       if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2463          VkImageAspectFlags color_aspects =
2464             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2465          anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2466             transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2467                                     range->baseMipLevel, level_count,
2468                                     base_layer, layer_count,
2469                                     img_barrier->oldLayout,
2470                                     img_barrier->newLayout,
2471                                     img_barrier->srcQueueFamilyIndex,
2472                                     img_barrier->dstQueueFamilyIndex,
2473                                     false /* will_full_fast_clear */);
2474          }
2475       }
2476    }
2477 
2478    enum anv_pipe_bits bits =
2479       anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2480       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2481 
2482    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2483 }
2484 
2485 void genX(CmdPipelineBarrier2KHR)(
2486     VkCommandBuffer                             commandBuffer,
2487     const VkDependencyInfoKHR*                  pDependencyInfo)
2488 {
2489    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2490 
2491    cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2492 }
2493 
2494 static void
2495 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2496 {
2497    assert(anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline));
2498 
2499    VkShaderStageFlags stages =
2500       cmd_buffer->state.gfx.pipeline->active_stages;
2501 
2502    /* To avoid thrashing, we assume that vertex and fragment stages always
2503     * exist.  In the rare case where one is missing *and* the other uses
2504     * push constants, this may be suboptimal.  However, avoiding stalls
2505     * seems more important.
2506     */
2507    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
2508 
2509    if (stages == cmd_buffer->state.gfx.push_constant_stages)
2510       return;
2511 
2512    const unsigned push_constant_kb =
2513       cmd_buffer->device->info.max_constant_urb_size_kb;
2514 
2515    const unsigned num_stages =
2516       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2517    unsigned size_per_stage = push_constant_kb / num_stages;
2518 
2519    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2520     * units of 2KB.  Incidentally, these are the same platforms that have
2521     * 32KB worth of push constant space.
2522     */
2523    if (push_constant_kb == 32)
2524       size_per_stage &= ~1u;
2525 
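   /* For example (illustrative numbers): with 32KB of push constant space
    * and only VS + FS active, num_stages == 2, so the VS allocation below
    * gets 16KB at offset 0 and the PS allocation gets the remaining 16KB.
    * With VS + GS + FS active, each of VS and GS gets 10KB (32 / 3, already
    * a multiple of 2KB) and the PS allocation picks up the remaining 12KB.
    */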
2526    uint32_t kb_used = 0;
2527    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2528       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2529       anv_batch_emit(&cmd_buffer->batch,
2530                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
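         /* 3DSTATE_PUSH_CONSTANT_ALLOC_VS has 3D command sub-opcode 18 and
          * the HS, DS and GS variants use the consecutive sub-opcodes, so we
          * can derive them from the stage index.
          */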
2531          alloc._3DCommandSubOpcode  = 18 + i;
2532          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2533          alloc.ConstantBufferSize   = push_size;
2534       }
2535       kb_used += push_size;
2536    }
2537 
2538    anv_batch_emit(&cmd_buffer->batch,
2539                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2540       alloc.ConstantBufferOffset = kb_used;
2541       alloc.ConstantBufferSize = push_constant_kb - kb_used;
2542    }
2543 
2544    cmd_buffer->state.gfx.push_constant_stages = stages;
2545 
2546    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2547     *
2548     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2549     *    the next 3DPRIMITIVE command after programming the
2550     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2551     *
2552     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2553     * pipeline setup, we need to dirty push constants.
2554     */
2555    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2556 }
2557 
2558 static VkResult
2559 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2560                    struct anv_cmd_pipeline_state *pipe_state,
2561                    struct anv_shader_bin *shader,
2562                    struct anv_state *bt_state)
2563 {
2564    struct anv_subpass *subpass = cmd_buffer->state.subpass;
2565    uint32_t state_offset;
2566 
2567    struct anv_pipeline_bind_map *map = &shader->bind_map;
2568    if (map->surface_count == 0) {
2569       *bt_state = (struct anv_state) { 0, };
2570       return VK_SUCCESS;
2571    }
2572 
2573    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2574                                                   map->surface_count,
2575                                                   &state_offset);
2576    uint32_t *bt_map = bt_state->map;
2577 
2578    if (bt_state->map == NULL)
2579       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2580 
2581    /* We only need to emit relocs if we're not using softpin.  If we are using
2582     * softpin then we always keep all user-allocated memory objects resident.
2583     */
2584    const bool need_client_mem_relocs =
2585       !anv_use_softpin(cmd_buffer->device->physical);
2586    struct anv_push_constants *push = &pipe_state->push_constants;
2587 
2588    for (uint32_t s = 0; s < map->surface_count; s++) {
2589       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2590 
2591       struct anv_state surface_state;
2592 
2593       switch (binding->set) {
2594       case ANV_DESCRIPTOR_SET_NULL:
2595          bt_map[s] = 0;
2596          break;
2597 
2598       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2599          /* Color attachment binding */
2600          assert(shader->stage == MESA_SHADER_FRAGMENT);
2601          if (binding->index < subpass->color_count) {
2602             const unsigned att =
2603                subpass->color_attachments[binding->index].attachment;
2604 
2605             /* From the Vulkan 1.0.46 spec:
2606              *
2607              *    "If any color or depth/stencil attachments are
2608              *    VK_ATTACHMENT_UNUSED, then no writes occur for those
2609              *    attachments."
2610              */
2611             if (att == VK_ATTACHMENT_UNUSED) {
2612                surface_state = cmd_buffer->state.null_surface_state;
2613             } else {
2614                surface_state = cmd_buffer->state.attachments[att].color.state;
2615             }
2616          } else {
2617             surface_state = cmd_buffer->state.null_surface_state;
2618          }
2619 
2620          assert(surface_state.map);
2621          bt_map[s] = surface_state.offset + state_offset;
2622          break;
2623 
2624       case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2625          struct anv_state surface_state =
2626             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2627 
2628          struct anv_address constant_data = {
2629             .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2630             .offset = shader->kernel.offset +
2631                       shader->prog_data->const_data_offset,
2632          };
2633          unsigned constant_data_size = shader->prog_data->const_data_size;
2634 
2635          const enum isl_format format =
2636             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2637                                                VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2638          anv_fill_buffer_surface_state(cmd_buffer->device,
2639                                        surface_state, format,
2640                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2641                                        constant_data, constant_data_size, 1);
2642 
2643          assert(surface_state.map);
2644          bt_map[s] = surface_state.offset + state_offset;
2645          add_surface_reloc(cmd_buffer, surface_state, constant_data);
2646          break;
2647       }
2648 
2649       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2650          /* This is always the first binding for compute shaders */
2651          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2652 
2653          struct anv_state surface_state =
2654             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2655 
2656          const enum isl_format format =
2657             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2658                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2659          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2660                                        format,
2661                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2662                                        cmd_buffer->state.compute.num_workgroups,
2663                                        12, 1);
2664 
2665          assert(surface_state.map);
2666          bt_map[s] = surface_state.offset + state_offset;
2667          if (need_client_mem_relocs) {
2668             add_surface_reloc(cmd_buffer, surface_state,
2669                               cmd_buffer->state.compute.num_workgroups);
2670          }
2671          break;
2672       }
2673 
2674       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2675          /* This is a descriptor set buffer so the set index is actually
2676           * given by binding->binding.  (Yes, that's confusing.)
2677           */
2678          struct anv_descriptor_set *set =
2679             pipe_state->descriptors[binding->index];
2680          assert(set->desc_mem.alloc_size);
2681          assert(set->desc_surface_state.alloc_size);
2682          bt_map[s] = set->desc_surface_state.offset + state_offset;
2683          add_surface_reloc(cmd_buffer, set->desc_surface_state,
2684                            anv_descriptor_set_address(set));
2685          break;
2686       }
2687 
2688       default: {
2689          assert(binding->set < MAX_SETS);
2690          const struct anv_descriptor_set *set =
2691             pipe_state->descriptors[binding->set];
2692          if (binding->index >= set->descriptor_count) {
2693             /* From the Vulkan spec section entitled "DescriptorSet and
2694              * Binding Assignment":
2695              *
2696              *    "If the array is runtime-sized, then array elements greater
2697              *    than or equal to the size of that binding in the bound
2698              *    descriptor set must not be used."
2699              *
2700              * Unfortunately, the compiler isn't smart enough to figure out
2701              * when a dynamic binding isn't used so it may grab the whole
2702              * array and stick it in the binding table.  In this case, it's
2703              * safe to just skip those bindings that are OOB.
2704              */
2705             assert(binding->index < set->layout->descriptor_count);
2706             continue;
2707          }
2708          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2709 
2710          switch (desc->type) {
2711          case VK_DESCRIPTOR_TYPE_SAMPLER:
2712             /* Nothing for us to do here */
2713             continue;
2714 
2715          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2716          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
2717             if (desc->image_view) {
2718                struct anv_surface_state sstate =
2719                   (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2720                   desc->image_view->planes[binding->plane].general_sampler_surface_state :
2721                   desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2722                surface_state = sstate.state;
2723                assert(surface_state.alloc_size);
2724                if (need_client_mem_relocs)
2725                   add_surface_state_relocs(cmd_buffer, sstate);
2726             } else {
2727                surface_state = cmd_buffer->device->null_surface_state;
2728             }
2729             break;
2730          }
2731          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2732             assert(shader->stage == MESA_SHADER_FRAGMENT);
2733             assert(desc->image_view != NULL);
2734             if ((desc->image_view->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) {
2735                /* For depth and stencil input attachments, we treat it like any
2736                 * old texture that a user may have bound.
2737                 */
2738                assert(desc->image_view->n_planes == 1);
2739                struct anv_surface_state sstate =
2740                   (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2741                   desc->image_view->planes[0].general_sampler_surface_state :
2742                   desc->image_view->planes[0].optimal_sampler_surface_state;
2743                surface_state = sstate.state;
2744                assert(surface_state.alloc_size);
2745                if (need_client_mem_relocs)
2746                   add_surface_state_relocs(cmd_buffer, sstate);
2747             } else {
2748                /* For color input attachments, we create the surface state at
2749                 * vkBeginRenderPass time so that we can include aux and clear
2750                 * color information.
2751                 */
2752                assert(binding->input_attachment_index < subpass->input_count);
2753                const unsigned subpass_att = binding->input_attachment_index;
2754                const unsigned att = subpass->input_attachments[subpass_att].attachment;
2755                surface_state = cmd_buffer->state.attachments[att].input.state;
2756             }
2757             break;
2758 
2759          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2760             if (desc->image_view) {
2761                struct anv_surface_state sstate =
2762                   binding->lowered_storage_surface
2763                   ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2764                   : desc->image_view->planes[binding->plane].storage_surface_state;
2765                surface_state = sstate.state;
2766                assert(surface_state.alloc_size);
2767                if (surface_state.offset == 0) {
2768                   mesa_loge("Bound an image to a descriptor where the "
2769                             "descriptor does not have NonReadable "
2770                             "set and the image does not have a "
2771                             "corresponding SPIR-V format enum.");
2772                   vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2773                                   VK_DEBUG_REPORT_ERROR_BIT_EXT,
2774                                   &desc->image_view->vk.base,
2775                                   __LINE__, 0, "anv",
2776                                   "Bound a image to a descriptor where the "
2777                                   "descriptor does not have NonReadable "
2778                                   "set and the image does not have a "
2779                                   "corresponding SPIR-V format enum.");
2780                }
2781                if (surface_state.offset && need_client_mem_relocs)
2782                   add_surface_state_relocs(cmd_buffer, sstate);
2783             } else {
2784                surface_state = cmd_buffer->device->null_surface_state;
2785             }
2786             break;
2787          }
2788 
2789          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2790          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2791          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2792             if (desc->buffer_view) {
2793                surface_state = desc->buffer_view->surface_state;
2794                assert(surface_state.alloc_size);
2795                if (need_client_mem_relocs) {
2796                   add_surface_reloc(cmd_buffer, surface_state,
2797                                     desc->buffer_view->address);
2798                }
2799             } else {
2800                surface_state = cmd_buffer->device->null_surface_state;
2801             }
2802             break;
2803 
2804          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2805          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2806             if (desc->buffer) {
2807                /* Compute the offset within the buffer */
2808                uint32_t dynamic_offset =
2809                   push->dynamic_offsets[binding->dynamic_offset_index];
2810                uint64_t offset = desc->offset + dynamic_offset;
2811                /* Clamp to the buffer size */
2812                offset = MIN2(offset, desc->buffer->size);
2813                /* Clamp the range to the buffer size */
2814                uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
2815 
2816                /* Align the range for consistency */
2817                if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2818                   range = align_u32(range, ANV_UBO_ALIGNMENT);
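               /* Worked example (illustrative numbers): with buffer->size ==
                * 1000, desc->offset == 512 and a dynamic offset of 256, the
                * offset becomes 768 and the range is clamped to at most 232
                * bytes, then rounded up to ANV_UBO_ALIGNMENT for dynamic
                * UBOs.
                */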
2819 
2820                struct anv_address address =
2821                   anv_address_add(desc->buffer->address, offset);
2822 
2823                surface_state =
2824                   anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2825                enum isl_format format =
2826                   anv_isl_format_for_descriptor_type(cmd_buffer->device,
2827                                                      desc->type);
2828 
2829                isl_surf_usage_flags_t usage =
2830                   desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2831                   ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2832                   ISL_SURF_USAGE_STORAGE_BIT;
2833 
2834                anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2835                                              format, usage, address, range, 1);
2836                if (need_client_mem_relocs)
2837                   add_surface_reloc(cmd_buffer, surface_state, address);
2838             } else {
2839                surface_state = cmd_buffer->device->null_surface_state;
2840             }
2841             break;
2842          }
2843 
2844          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2845             if (desc->buffer_view) {
2846                surface_state = binding->lowered_storage_surface
2847                   ? desc->buffer_view->lowered_storage_surface_state
2848                   : desc->buffer_view->storage_surface_state;
2849                assert(surface_state.alloc_size);
2850                if (need_client_mem_relocs) {
2851                   add_surface_reloc(cmd_buffer, surface_state,
2852                                     desc->buffer_view->address);
2853                }
2854             } else {
2855                surface_state = cmd_buffer->device->null_surface_state;
2856             }
2857             break;
2858 
2859          default:
2860             assert(!"Invalid descriptor type");
2861             continue;
2862          }
2863          assert(surface_state.map);
2864          bt_map[s] = surface_state.offset + state_offset;
2865          break;
2866       }
2867       }
2868    }
2869 
2870    return VK_SUCCESS;
2871 }
2872 
2873 static VkResult
2874 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2875               struct anv_cmd_pipeline_state *pipe_state,
2876               struct anv_shader_bin *shader,
2877               struct anv_state *state)
2878 {
2879    struct anv_pipeline_bind_map *map = &shader->bind_map;
2880    if (map->sampler_count == 0) {
2881       *state = (struct anv_state) { 0, };
2882       return VK_SUCCESS;
2883    }
2884 
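   /* Each SAMPLER_STATE entry is 4 dwords (16 bytes) on the platforms
    * handled here, hence the fixed 16-byte stride below and in the memcpy
    * further down.
    */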
2885    uint32_t size = map->sampler_count * 16;
2886    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2887 
2888    if (state->map == NULL)
2889       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2890 
2891    for (uint32_t s = 0; s < map->sampler_count; s++) {
2892       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2893       const struct anv_descriptor *desc =
2894          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2895 
2896       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2897           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2898          continue;
2899 
2900       struct anv_sampler *sampler = desc->sampler;
2901 
2902       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2903        * happens to be zero.
2904        */
2905       if (sampler == NULL)
2906          continue;
2907 
2908       memcpy(state->map + (s * 16),
2909              sampler->state[binding->plane], sizeof(sampler->state[0]));
2910    }
2911 
2912    return VK_SUCCESS;
2913 }
2914 
2915 static uint32_t
2916 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2917                       struct anv_cmd_pipeline_state *pipe_state,
2918                       const VkShaderStageFlags dirty,
2919                       struct anv_shader_bin **shaders,
2920                       uint32_t num_shaders)
2921 {
2922    VkShaderStageFlags flushed = 0;
2923 
2924    VkResult result = VK_SUCCESS;
2925    for (uint32_t i = 0; i < num_shaders; i++) {
2926       if (!shaders[i])
2927          continue;
2928 
2929       gl_shader_stage stage = shaders[i]->stage;
2930       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2931       if ((vk_stage & dirty) == 0)
2932          continue;
2933 
2934       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2935       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2936                              &cmd_buffer->state.samplers[stage]);
2937       if (result != VK_SUCCESS)
2938          break;
2939 
2940       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2941       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2942                                   &cmd_buffer->state.binding_tables[stage]);
2943       if (result != VK_SUCCESS)
2944          break;
2945 
2946       flushed |= vk_stage;
2947    }
2948 
2949    if (result != VK_SUCCESS) {
2950       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2951 
2952       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2953       if (result != VK_SUCCESS)
2954          return 0;
2955 
2956       /* Re-emit state base addresses so we get the new surface state base
2957        * address before we start emitting binding tables etc.
2958        */
2959       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2960 
2961       /* Re-emit all active binding tables */
2962       flushed = 0;
2963 
2964       for (uint32_t i = 0; i < num_shaders; i++) {
2965          if (!shaders[i])
2966             continue;
2967 
2968          gl_shader_stage stage = shaders[i]->stage;
2969 
2970          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2971                                 &cmd_buffer->state.samplers[stage]);
2972          if (result != VK_SUCCESS) {
2973             anv_batch_set_error(&cmd_buffer->batch, result);
2974             return 0;
2975          }
2976          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2977                                      &cmd_buffer->state.binding_tables[stage]);
2978          if (result != VK_SUCCESS) {
2979             anv_batch_set_error(&cmd_buffer->batch, result);
2980             return 0;
2981          }
2982 
2983          flushed |= mesa_to_vk_shader_stage(stage);
2984       }
2985    }
2986 
2987    return flushed;
2988 }
2989 
2990 static void
2991 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2992                                     uint32_t stages)
2993 {
2994    static const uint32_t sampler_state_opcodes[] = {
2995       [MESA_SHADER_VERTEX]                      = 43,
2996       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
2997       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
2998       [MESA_SHADER_GEOMETRY]                    = 46,
2999       [MESA_SHADER_FRAGMENT]                    = 47,
3000       [MESA_SHADER_COMPUTE]                     = 0,
3001    };
3002 
3003    static const uint32_t binding_table_opcodes[] = {
3004       [MESA_SHADER_VERTEX]                      = 38,
3005       [MESA_SHADER_TESS_CTRL]                   = 39,
3006       [MESA_SHADER_TESS_EVAL]                   = 40,
3007       [MESA_SHADER_GEOMETRY]                    = 41,
3008       [MESA_SHADER_FRAGMENT]                    = 42,
3009       [MESA_SHADER_COMPUTE]                     = 0,
3010    };
3011 
3012    anv_foreach_stage(s, stages) {
3013       assert(s < ARRAY_SIZE(binding_table_opcodes));
3014       assert(binding_table_opcodes[s] > 0);
3015 
3016       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
3017          anv_batch_emit(&cmd_buffer->batch,
3018                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
3019             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
3020             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
3021          }
3022       }
3023 
3024       /* Always emit binding table pointers if we're asked to, since on SKL
3025        * this is what flushes push constants. */
3026       anv_batch_emit(&cmd_buffer->batch,
3027                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
3028          btp._3DCommandSubOpcode = binding_table_opcodes[s];
3029          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
3030       }
3031    }
3032 }
3033 
3034 static struct anv_address
3035 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
3036                        const struct anv_shader_bin *shader,
3037                        const struct anv_push_range *range)
3038 {
3039    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3040    switch (range->set) {
3041    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3042       /* This is a descriptor set buffer so the set index is
3043        * actually given by binding->binding.  (Yes, that's
3044        * confusing.)
3045        */
3046       struct anv_descriptor_set *set =
3047          gfx_state->base.descriptors[range->index];
3048       return anv_descriptor_set_address(set);
3049    }
3050 
3051    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
3052       if (gfx_state->base.push_constants_state.alloc_size == 0) {
3053          gfx_state->base.push_constants_state =
3054             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
3055       }
3056       return (struct anv_address) {
3057          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3058          .offset = gfx_state->base.push_constants_state.offset,
3059       };
3060    }
3061 
3062    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3063       return (struct anv_address) {
3064          .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
3065          .offset = shader->kernel.offset +
3066                    shader->prog_data->const_data_offset,
3067       };
3068 
3069    default: {
3070       assert(range->set < MAX_SETS);
3071       struct anv_descriptor_set *set =
3072          gfx_state->base.descriptors[range->set];
3073       const struct anv_descriptor *desc =
3074          &set->descriptors[range->index];
3075 
3076       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3077          if (desc->buffer_view)
3078             return desc->buffer_view->address;
3079       } else {
3080          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3081          if (desc->buffer) {
3082             const struct anv_push_constants *push =
3083                &gfx_state->base.push_constants;
3084             uint32_t dynamic_offset =
3085                push->dynamic_offsets[range->dynamic_offset_index];
3086             return anv_address_add(desc->buffer->address,
3087                                    desc->offset + dynamic_offset);
3088          }
3089       }
3090 
3091       /* For NULL UBOs, we just return an address in the workaround BO.  We do
3092        * writes to it for workarounds but always at the bottom.  The higher
3093        * bytes should be all zeros.
3094        */
3095       assert(range->length * 32 <= 2048);
3096       return (struct anv_address) {
3097          .bo = cmd_buffer->device->workaround_bo,
3098          .offset = 1024,
3099       };
3100    }
3101    }
3102 }
3103 
3104 
3105 /** Returns the size in bytes of the bound buffer
3106  *
3107  * The range is relative to the start of the buffer, not the start of the
3108  * range.  The returned range may be smaller than
3109  *
3110  *    (range->start + range->length) * 32;
3111  */
3112 static uint32_t
3113 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
3114                           const struct anv_shader_bin *shader,
3115                           const struct anv_push_range *range)
3116 {
3117    assert(shader->stage != MESA_SHADER_COMPUTE);
3118    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3119    switch (range->set) {
3120    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3121       struct anv_descriptor_set *set =
3122          gfx_state->base.descriptors[range->index];
3123       assert(range->start * 32 < set->desc_mem.alloc_size);
3124       assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
3125       return set->desc_mem.alloc_size;
3126    }
3127 
3128    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
3129       return (range->start + range->length) * 32;
3130 
3131    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3132       return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
3133 
3134    default: {
3135       assert(range->set < MAX_SETS);
3136       struct anv_descriptor_set *set =
3137          gfx_state->base.descriptors[range->set];
3138       const struct anv_descriptor *desc =
3139          &set->descriptors[range->index];
3140 
3141       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3142          if (!desc->buffer_view)
3143             return 0;
3144 
3145          if (range->start * 32 > desc->buffer_view->range)
3146             return 0;
3147 
3148          return desc->buffer_view->range;
3149       } else {
3150          if (!desc->buffer)
3151             return 0;
3152 
3153          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3154          /* Compute the offset within the buffer */
3155          const struct anv_push_constants *push =
3156             &gfx_state->base.push_constants;
3157          uint32_t dynamic_offset =
3158             push->dynamic_offsets[range->dynamic_offset_index];
3159          uint64_t offset = desc->offset + dynamic_offset;
3160          /* Clamp to the buffer size */
3161          offset = MIN2(offset, desc->buffer->size);
3162          /* Clamp the range to the buffer size */
3163          uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);
3164 
3165          /* Align the range for consistency */
3166          bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
3167 
3168          return bound_range;
3169       }
3170    }
3171    }
3172 }
3173 
3174 static void
3175 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
3176                               gl_shader_stage stage,
3177                               struct anv_address *buffers,
3178                               unsigned buffer_count)
3179 {
3180    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3181    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3182 
3183    static const uint32_t push_constant_opcodes[] = {
3184       [MESA_SHADER_VERTEX]                      = 21,
3185       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3186       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3187       [MESA_SHADER_GEOMETRY]                    = 22,
3188       [MESA_SHADER_FRAGMENT]                    = 23,
3189       [MESA_SHADER_COMPUTE]                     = 0,
3190    };
3191 
3192    assert(stage < ARRAY_SIZE(push_constant_opcodes));
3193    assert(push_constant_opcodes[stage] > 0);
3194 
3195    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
3196       c._3DCommandSubOpcode = push_constant_opcodes[stage];
3197 
3198       if (anv_pipeline_has_stage(pipeline, stage)) {
3199          const struct anv_pipeline_bind_map *bind_map =
3200             &pipeline->shaders[stage]->bind_map;
3201 
3202 #if GFX_VER >= 9
3203          /* This field exists since Gfx8.  However, the Broadwell PRM says:
3204           *
3205           *    "Constant Buffer Object Control State must be always programmed
3206           *    to zero."
3207           *
3208           * This restriction does not exist on any newer platforms.
3209           *
3210           * We only have one MOCS field for the whole packet, not one per
3211           * buffer.  We could go out of our way here to walk over all of the
3212           * buffers and see if any of them are used externally and use the
3213           * external MOCS.  However, the notion that someone would use the
3214           * same bit of memory for both scanout and a UBO is nuts.  Let's not
3215           * bother and assume it's all internal.
3216           */
3217          c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
3218 #endif
3219 
3220 #if GFX_VERx10 >= 75
3221          /* The Skylake PRM contains the following restriction:
3222           *
3223           *    "The driver must ensure The following case does not occur
3224           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3225           *     buffer 3 read length equal to zero committed followed by a
3226           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3227           *     zero committed."
3228           *
3229           * To avoid this, we program the buffers in the highest slots.
3230           * This way, slot 0 is only used if slot 3 is also used.
3231           */
3232          assert(buffer_count <= 4);
3233          const unsigned shift = 4 - buffer_count;
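         /* For example, with two non-empty ranges shift == 2, so they land
          * in constant buffer slots 2 and 3 and slots 0 and 1 stay unused.
          */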
3234          for (unsigned i = 0; i < buffer_count; i++) {
3235             const struct anv_push_range *range = &bind_map->push_ranges[i];
3236 
3237             /* At this point we only have non-empty ranges */
3238             assert(range->length > 0);
3239 
3240             /* For Ivy Bridge, make sure we only set the first range (actual
3241              * push constants)
3242              */
3243             assert((GFX_VERx10 >= 75) || i == 0);
3244 
3245             c.ConstantBody.ReadLength[i + shift] = range->length;
3246             c.ConstantBody.Buffer[i + shift] =
3247                anv_address_add(buffers[i], range->start * 32);
3248          }
3249 #else
3250          /* For Ivy Bridge, push constants are relative to dynamic state
3251           * base address and we only ever push actual push constants.
3252           */
3253          if (bind_map->push_ranges[0].length > 0) {
3254             assert(buffer_count == 1);
3255             assert(bind_map->push_ranges[0].set ==
3256                    ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
3257             assert(buffers[0].bo ==
3258                    cmd_buffer->device->dynamic_state_pool.block_pool.bo);
3259             c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
3260             c.ConstantBody.Buffer[0].bo = NULL;
3261             c.ConstantBody.Buffer[0].offset = buffers[0].offset;
3262          }
3263          assert(bind_map->push_ranges[1].length == 0);
3264          assert(bind_map->push_ranges[2].length == 0);
3265          assert(bind_map->push_ranges[3].length == 0);
3266 #endif
3267       }
3268    }
3269 }
3270 
3271 #if GFX_VER >= 12
3272 static void
3273 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
3274                                   uint32_t shader_mask,
3275                                   struct anv_address *buffers,
3276                                   uint32_t buffer_count)
3277 {
3278    if (buffer_count == 0) {
3279       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
3280          c.ShaderUpdateEnable = shader_mask;
3281          c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
3282       }
3283       return;
3284    }
3285 
3286    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3287    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3288 
3289    static const UNUSED uint32_t push_constant_opcodes[] = {
3290       [MESA_SHADER_VERTEX]                      = 21,
3291       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3292       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3293       [MESA_SHADER_GEOMETRY]                    = 22,
3294       [MESA_SHADER_FRAGMENT]                    = 23,
3295       [MESA_SHADER_COMPUTE]                     = 0,
3296    };
3297 
3298    gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
3299    assert(stage < ARRAY_SIZE(push_constant_opcodes));
3300    assert(push_constant_opcodes[stage] > 0);
3301 
3302    const struct anv_pipeline_bind_map *bind_map =
3303       &pipeline->shaders[stage]->bind_map;
3304 
3305    uint32_t *dw;
3306    const uint32_t buffer_mask = (1 << buffer_count) - 1;
3307    const uint32_t num_dwords = 2 + 2 * buffer_count;
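   /* 3DSTATE_CONSTANT_ALL has two fixed dwords followed by one two-dword
    * CONSTANT_ALL_DATA entry (pointer + read length) per buffer, which is
    * where the dword count above comes from.
    */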
3308 
3309    dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3310                         GENX(3DSTATE_CONSTANT_ALL),
3311                         .ShaderUpdateEnable = shader_mask,
3312                         .PointerBufferMask = buffer_mask,
3313                         .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
3314 
3315    for (int i = 0; i < buffer_count; i++) {
3316       const struct anv_push_range *range = &bind_map->push_ranges[i];
3317       GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
3318          &cmd_buffer->batch, dw + 2 + i * 2,
3319          &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
3320             .PointerToConstantBuffer =
3321                anv_address_add(buffers[i], range->start * 32),
3322             .ConstantBufferReadLength = range->length,
3323          });
3324    }
3325 }
3326 #endif
3327 
3328 static void
3329 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
3330                                 VkShaderStageFlags dirty_stages)
3331 {
3332    VkShaderStageFlags flushed = 0;
3333    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3334    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3335 
3336 #if GFX_VER >= 12
3337    uint32_t nobuffer_stages = 0;
3338 #endif
3339 
3340    /* Compute robust pushed register access mask for each stage. */
3341    if (cmd_buffer->device->robust_buffer_access) {
3342       anv_foreach_stage(stage, dirty_stages) {
3343          if (!anv_pipeline_has_stage(pipeline, stage))
3344             continue;
3345 
3346          const struct anv_shader_bin *shader = pipeline->shaders[stage];
3347          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3348          struct anv_push_constants *push = &gfx_state->base.push_constants;
3349 
3350          push->push_reg_mask[stage] = 0;
3351          /* Start of the current range in the shader, relative to the start of
3352           * push constants in the shader.
3353           */
3354          unsigned range_start_reg = 0;
3355          for (unsigned i = 0; i < 4; i++) {
3356             const struct anv_push_range *range = &bind_map->push_ranges[i];
3357             if (range->length == 0)
3358                continue;
3359 
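            /* Illustrative example: a range with start == 2 and length == 4
             * covers shader push registers [2, 6).  If only 96 bytes of the
             * buffer are bound, DIV_ROUND_UP(96, 32) == 3 registers are
             * valid, so just one register of this range gets marked in
             * push_reg_mask.
             */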
3360             unsigned bound_size =
3361                get_push_range_bound_size(cmd_buffer, shader, range);
3362             if (bound_size >= range->start * 32) {
3363                unsigned bound_regs =
3364                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
3365                        range->length);
3366                assert(range_start_reg + bound_regs <= 64);
3367                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
3368                                                               bound_regs);
3369             }
3370 
3371             cmd_buffer->state.push_constants_dirty |=
3372                mesa_to_vk_shader_stage(stage);
3373 
3374             range_start_reg += range->length;
3375          }
3376       }
3377    }
3378 
3379    /* Resets the push constant state so that we allocate a new one if
3380     * needed.
3381     */
3382    gfx_state->base.push_constants_state = ANV_STATE_NULL;
3383 
3384    anv_foreach_stage(stage, dirty_stages) {
3385       unsigned buffer_count = 0;
3386       flushed |= mesa_to_vk_shader_stage(stage);
3387       UNUSED uint32_t max_push_range = 0;
3388 
3389       struct anv_address buffers[4] = {};
3390       if (anv_pipeline_has_stage(pipeline, stage)) {
3391          const struct anv_shader_bin *shader = pipeline->shaders[stage];
3392          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3393 
3394          /* We have to gather buffer addresses as a second step because the
3395           * loop above puts data into the push constant area and the call to
3396           * get_push_range_address is what locks our push constants and copies
3397           * them into the actual GPU buffer.  If we did the two loops at the
3398           * same time, we'd risk only having some of the sizes in the push
3399           * constant buffer when we did the copy.
3400           */
3401          for (unsigned i = 0; i < 4; i++) {
3402             const struct anv_push_range *range = &bind_map->push_ranges[i];
3403             if (range->length == 0)
3404                break;
3405 
3406             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
3407             max_push_range = MAX2(max_push_range, range->length);
3408             buffer_count++;
3409          }
3410 
3411          /* We have at most 4 buffers but they should be tightly packed */
3412          for (unsigned i = buffer_count; i < 4; i++)
3413             assert(bind_map->push_ranges[i].length == 0);
3414       }
3415 
3416 #if GFX_VER >= 12
3417       /* If this stage doesn't have any push constants, emit it later in a
3418        * single CONSTANT_ALL packet.
3419        */
3420       if (buffer_count == 0) {
3421          nobuffer_stages |= 1 << stage;
3422          continue;
3423       }
3424 
3425       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
3426        * contains only 5 bits, so we can only use it for buffers smaller than
3427        * 32.
3428        */
3429       if (max_push_range < 32) {
3430          cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
3431                                            buffers, buffer_count);
3432          continue;
3433       }
3434 #endif
3435 
3436       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
3437    }
3438 
3439 #if GFX_VER >= 12
3440    if (nobuffer_stages)
3441       cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
3442 #endif
3443 
3444    cmd_buffer->state.push_constants_dirty &= ~flushed;
3445 }
3446 
3447 static void
3448 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
3449 {
3450    const uint32_t clip_states =
3451 #if GFX_VER <= 7
3452       ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3453       ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
3454 #endif
3455       ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
3456       ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
3457       ANV_CMD_DIRTY_PIPELINE;
3458 
3459    if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)
3460       return;
3461 
3462    /* Take dynamic primitive topology into account with
3463     *    3DSTATE_CLIP::ViewportXYClipTestEnable
3464     */
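   /* Presumably guardband (XY) clipping is only safe when rasterizing
    * filled polygons; wide lines and points could otherwise be clipped
    * away, so we key it off the effective polygon mode here.
    */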
3465    bool xy_clip_test_enable = 0;
3466 
3467    if (cmd_buffer->state.gfx.pipeline->dynamic_states &
3468        ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
3469       VkPrimitiveTopology primitive_topology =
3470          cmd_buffer->state.gfx.dynamic.primitive_topology;
3471 
3472       VkPolygonMode dynamic_raster_mode =
3473          genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
3474                                    primitive_topology);
3475 
3476       xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
3477    }
3478 
3479 #if GFX_VER <= 7
3480    const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
3481 #endif
3482    struct GENX(3DSTATE_CLIP) clip = {
3483       GENX(3DSTATE_CLIP_header),
3484 #if GFX_VER <= 7
3485       .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
3486       .CullMode     = genX(vk_to_intel_cullmode)[d->cull_mode],
3487 #endif
3488       .ViewportXYClipTestEnable = xy_clip_test_enable,
3489    };
3490    uint32_t dwords[GENX(3DSTATE_CLIP_length)];
3491 
3492    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3493    if (anv_pipeline_is_primitive(pipeline)) {
3494       const struct brw_vue_prog_data *last =
3495          anv_pipeline_get_last_vue_prog_data(pipeline);
3496       if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3497          clip.MaximumVPIndex =
3498             cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?
3499             cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;
3500       }
3501    }
3502 
3503    GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
3504    anv_batch_emit_merge(&cmd_buffer->batch, dwords,
3505                         pipeline->gfx7.clip);
3506 }
3507 
3508 static void
3509 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3510 {
3511    const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
3512    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3513 
3514 #if GFX_VER == 7
3515 #  define streamout_state_dw pipeline->gfx7.streamout_state
3516 #else
3517 #  define streamout_state_dw pipeline->gfx8.streamout_state
3518 #endif
3519 
3520    uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3521 
3522    struct GENX(3DSTATE_STREAMOUT) so = {
3523       GENX(3DSTATE_STREAMOUT_header),
3524       .RenderingDisable = d->raster_discard,
3525    };
3526    GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3527    anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3528 }
3529 
3530 void
3531 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
3532 {
3533    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3534    uint32_t *p;
3535 
3536    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3537 
3538    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3539 
3540    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
3541 
3542    genX(flush_pipeline_select_3d)(cmd_buffer);
3543 
3544    /* Apply any pending pipeline flushes we may have.  We want to apply them
3545     * now because, if any of those flushes are for things like push constants,
3546     * the GPU will read the state at weird times.
3547     */
3548    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3549 
3550    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3551    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3552       vb_emit |= pipeline->vb_used;
3553 
3554    if (vb_emit) {
3555       const uint32_t num_buffers = __builtin_popcount(vb_emit);
3556       const uint32_t num_dwords = 1 + num_buffers * 4;
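      /* One dword of 3DSTATE_VERTEX_BUFFERS header plus four dwords of
       * VERTEX_BUFFER_STATE per buffer, matching the &p[1 + i * 4] packing
       * below.
       */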
3557 
3558       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3559                           GENX(3DSTATE_VERTEX_BUFFERS));
3560       uint32_t i = 0;
3561       u_foreach_bit(vb, vb_emit) {
3562          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3563          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3564 
3565          /* If dynamic, use stride/size from vertex binding, otherwise use
3566           * stride/size that was set up in the pipeline object.
3567           */
3568          bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;
3569          bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;
3570 
3571          struct GENX(VERTEX_BUFFER_STATE) state;
3572          if (buffer) {
3573             uint32_t stride = dynamic_stride ?
3574                cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;
3575             /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):
3576              *
3577              * "If pname:pSizes is not NULL then pname:pSizes[i] specifies
3578              * the bound size of the vertex buffer starting from the corresponding
3579              * elements of pname:pBuffers[i] plus pname:pOffsets[i]."
3580              */
3581             UNUSED uint32_t size = dynamic_size ?
3582                cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;
3583 
3584             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3585                .VertexBufferIndex = vb,
3586 
3587                .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3588                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3589 #if GFX_VER <= 7
3590                .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA,
3591                .InstanceDataStepRate = pipeline->vb[vb].instance_divisor,
3592 #endif
3593                .AddressModifyEnable = true,
3594                .BufferPitch = stride,
3595                .BufferStartingAddress = anv_address_add(buffer->address, offset),
3596                .NullVertexBuffer = offset >= buffer->size,
3597 #if GFX_VER >= 12
3598                .L3BypassDisable = true,
3599 #endif
3600 
3601 #if GFX_VER >= 8
3602                .BufferSize = size,
3603 #else
3604                /* XXX: to handle dynamic offset for older gens we might want
3605                 * to modify EndAddress, but there are issues when doing so:
3606                 *
3607                 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3608                 */
3609                .EndAddress = anv_address_add(buffer->address, buffer->size - 1),
3610 #endif
3611             };
3612          } else {
3613             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3614                .VertexBufferIndex = vb,
3615                .NullVertexBuffer = true,
3616             };
3617          }
3618 
3619 #if GFX_VER >= 8 && GFX_VER <= 9
3620          genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3621                                                         state.BufferStartingAddress,
3622                                                         state.BufferSize);
3623 #endif
3624 
3625          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3626          i++;
3627       }
3628    }
3629 
3630    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3631 
3632    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3633                                 pipeline->active_stages;
3634    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3635        !cmd_buffer->state.push_constants_dirty)
3636       return;
3637 
3638    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3639        (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3640                          ANV_CMD_DIRTY_PIPELINE))) {
3641       /* We don't need any per-buffer dirty tracking because you're not
3642        * allowed to bind different XFB buffers while XFB is enabled.
3643        */
3644       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3645          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3646          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3647 #if GFX_VER < 12
3648             sob.SOBufferIndex = idx;
3649 #else
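            /* On Gfx12+, the SO buffer index is selected via the command
             * sub-opcode rather than a field in the packet body.
             */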
3650             sob._3DCommandOpcode = 0;
3651             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
3652 #endif
3653 
3654             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3655                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
3656                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3657                                                         xfb->offset);
3658 #if GFX_VER >= 8
3659                sob.SOBufferEnable = true;
3660                sob.StreamOffsetWriteEnable = false;
3661                /* Size is in DWords - 1 */
3662                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3663 #else
3664                /* 3DSTATE_SO_BUFFER has no SOBufferEnable field on Gfx7, so we
3665                 * rely on SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3666                 * default for an empty SO_BUFFER packet) to disable the buffer.
3667                 */
3668                sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3669                sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3670                                                        xfb->offset + xfb->size);
3671 #endif
3672             }
3673          }
3674       }
3675 
3676       /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
3677       if (GFX_VER >= 10) {
3678          anv_add_pending_pipe_bits(cmd_buffer,
3679                                    ANV_PIPE_CS_STALL_BIT,
3680                                    "after 3DSTATE_SO_BUFFER call");
3681       }
3682    }
3683 
3684    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3685       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3686 
3687       /* Remove from dynamic state emission all of the state that is baked
3688        * into the pipeline.
3689        */
3690       cmd_buffer->state.gfx.dirty &= ~pipeline->static_state_mask;
3691 
3692       /* If the pipeline changed, we may need to re-allocate push constant
3693        * space in the URB.
3694        */
3695       cmd_buffer_alloc_push_constants(cmd_buffer);
3696    }
3697 
3698    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3699       cmd_buffer->state.gfx.primitive_topology = pipeline->topology;
3700 
3701 #if GFX_VER <= 7
3702    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3703        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3704       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3705        *
3706        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3707        *    stall needs to be sent just prior to any 3DSTATE_VS,
3708        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3709        *    3DSTATE_BINDING_TABLE_POINTER_VS,
3710        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
3711        *    PIPE_CONTROL needs to be sent before any combination of VS
3712        *    associated 3DSTATE."
3713        */
3714       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3715          pc.DepthStallEnable  = true;
3716          pc.PostSyncOperation = WriteImmediateData;
3717          pc.Address           = cmd_buffer->device->workaround_address;
3718          anv_debug_dump_pc(pc);
3719       }
3720    }
3721 #endif
3722 
3723    /* Render targets live in the same binding table as fragment descriptors */
3724    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3725       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3726 
3727    /* We emit the binding tables and sampler tables first, then emit push
3728     * constants and then finally emit binding table and sampler table
3729     * pointers.  It has to happen in this order, since emitting the binding
3730     * tables may change the push constants (in case of storage images). After
3731     * emitting push constants, on SKL+ we have to emit the corresponding
3732     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
3733     */
3734    uint32_t dirty = 0;
3735    if (descriptors_dirty) {
3736       dirty = flush_descriptor_sets(cmd_buffer,
3737                                     &cmd_buffer->state.gfx.base,
3738                                     descriptors_dirty,
3739                                     pipeline->shaders,
3740                                     ARRAY_SIZE(pipeline->shaders));
3741       cmd_buffer->state.descriptors_dirty &= ~dirty;
3742    }
3743 
3744    if (dirty || cmd_buffer->state.push_constants_dirty) {
3745       /* Because we're pushing UBOs, we have to push whenever either
3746        * descriptors or push constants is dirty.
3747        */
3748       dirty |= cmd_buffer->state.push_constants_dirty;
3749       dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS;
3750       cmd_buffer_flush_push_constants(cmd_buffer, dirty);
3751    }
3752 
3753    if (dirty)
3754       cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
3755 
3756    cmd_buffer_emit_clip(cmd_buffer);
3757 
3758    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
3759       cmd_buffer_emit_streamout(cmd_buffer);
3760 
3761    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
3762       gfx8_cmd_buffer_emit_viewport(cmd_buffer);
3763 
3764    if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
3765                                   ANV_CMD_DIRTY_PIPELINE)) {
3766       gfx8_cmd_buffer_emit_depth_viewport(cmd_buffer,
3767                                           pipeline->depth_clamp_enable);
3768    }
3769 
3770    if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
3771                                       ANV_CMD_DIRTY_RENDER_TARGETS))
3772       gfx7_cmd_buffer_emit_scissor(cmd_buffer);
3773 
3774    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
3775 }
3776 
3777 static void
3778 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
3779                struct anv_address addr,
3780                uint32_t size, uint32_t index)
3781 {
3782    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
3783                                  GENX(3DSTATE_VERTEX_BUFFERS));
3784 
3785    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
3786       &(struct GENX(VERTEX_BUFFER_STATE)) {
3787          .VertexBufferIndex = index,
3788          .AddressModifyEnable = true,
3789          .BufferPitch = 0,
3790          .MOCS = addr.bo ? anv_mocs(cmd_buffer->device, addr.bo,
3791                                     ISL_SURF_USAGE_VERTEX_BUFFER_BIT) : 0,
3792          .NullVertexBuffer = size == 0,
3793 #if GFX_VER >= 12
3794          .L3BypassDisable = true,
3795 #endif
3796 #if (GFX_VER >= 8)
3797          .BufferStartingAddress = addr,
3798          .BufferSize = size
3799 #else
3800          .BufferStartingAddress = addr,
3801          .EndAddress = anv_address_add(addr, size),
3802 #endif
3803       });
3804 
3805    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
3806                                                   index, addr, size);
3807 }
3808 
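/* emit_base_vertex_instance() below writes the base vertex and base instance
 * as two consecutive uint32s, hence the 8-byte size whenever a real address
 * is bound to the SVGS vertex buffer slot.
 */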
3809 static void
3810 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
3811                              struct anv_address addr)
3812 {
3813    emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
3814 }
3815 
3816 static void
3817 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
3818                           uint32_t base_vertex, uint32_t base_instance)
3819 {
3820    if (base_vertex == 0 && base_instance == 0) {
3821       emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
3822    } else {
3823       struct anv_state id_state =
3824          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
3825 
3826       ((uint32_t *)id_state.map)[0] = base_vertex;
3827       ((uint32_t *)id_state.map)[1] = base_instance;
3828 
3829       struct anv_address addr = {
3830          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3831          .offset = id_state.offset,
3832       };
3833 
3834       emit_base_vertex_instance_bo(cmd_buffer, addr);
3835    }
3836 }
3837 
3838 static void
3839 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
3840 {
3841    struct anv_state state =
3842       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
3843 
3844    ((uint32_t *)state.map)[0] = draw_index;
3845 
3846    struct anv_address addr = {
3847       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3848       .offset = state.offset,
3849    };
3850 
3851    emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
3852 }
3853 
3854 static void
3855 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
3856                                    uint32_t access_type)
3857 {
3858    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3859    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3860 
3861    uint64_t vb_used = pipeline->vb_used;
3862    if (vs_prog_data->uses_firstvertex ||
3863        vs_prog_data->uses_baseinstance)
3864       vb_used |= 1ull << ANV_SVGS_VB_INDEX;
3865    if (vs_prog_data->uses_drawid)
3866       vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
3867 
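   /* access_type mirrors 3DPRIMITIVE::VertexAccessType: RANDOM for indexed
    * draws, SEQUENTIAL for non-indexed ones.
    */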
3868    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
3869                                                        access_type == RANDOM,
3870                                                        vb_used);
3871 }
3872 
3873 ALWAYS_INLINE static void
3874 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
3875                                            const struct brw_vs_prog_data *vs_prog_data,
3876                                            uint32_t base_vertex,
3877                                            uint32_t base_instance,
3878                                            uint32_t draw_id,
3879                                            bool force_flush)
3880 {
3881    bool emitted = false;
3882    if (vs_prog_data->uses_firstvertex ||
3883        vs_prog_data->uses_baseinstance) {
3884       emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
3885       emitted = true;
3886    }
3887    if (vs_prog_data->uses_drawid) {
3888       emit_draw_index(cmd_buffer, draw_id);
3889       emitted = true;
3890    }
3891    /* Emitting draw index or vertex index BOs may result in needing
3892     * additional VF cache flushes.
3893     */
3894    if (emitted || force_flush)
3895       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3896 }
3897 
3898 void genX(CmdDraw)(
3899     VkCommandBuffer                             commandBuffer,
3900     uint32_t                                    vertexCount,
3901     uint32_t                                    instanceCount,
3902     uint32_t                                    firstVertex,
3903     uint32_t                                    firstInstance)
3904 {
3905    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3906    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3907    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3908 
3909    if (anv_batch_has_error(&cmd_buffer->batch))
3910       return;
3911 
3912    const uint32_t count = (vertexCount *
3913                            instanceCount *
3914                            (pipeline->use_primitive_replication ?
3915                             1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
3916    anv_measure_snapshot(cmd_buffer,
3917                         INTEL_SNAPSHOT_DRAW,
3918                         "draw", count);
3919 
3920    genX(cmd_buffer_flush_state)(cmd_buffer);
3921 
3922    if (cmd_buffer->state.conditional_render_enabled)
3923       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3924 
3925    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3926                                               firstVertex, firstInstance, 0,
3927                                               true);
3928 
3929    /* Our implementation of VK_KHR_multiview uses instancing to draw the
3930     * different views.  We need to multiply instanceCount by the view count.
3931     */
3932    if (!pipeline->use_primitive_replication)
3933       instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
3934 
3935    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3936       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3937       prim.VertexAccessType         = SEQUENTIAL;
3938       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3939       prim.VertexCountPerInstance   = vertexCount;
3940       prim.StartVertexLocation      = firstVertex;
3941       prim.InstanceCount            = instanceCount;
3942       prim.StartInstanceLocation    = firstInstance;
3943       prim.BaseVertexLocation       = 0;
3944    }
3945 
3946    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3947 }
3948 
3949 void genX(CmdDrawMultiEXT)(
3950     VkCommandBuffer                             commandBuffer,
3951     uint32_t                                    drawCount,
3952     const VkMultiDrawInfoEXT                   *pVertexInfo,
3953     uint32_t                                    instanceCount,
3954     uint32_t                                    firstInstance,
3955     uint32_t                                    stride)
3956 {
3957    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3958    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3959    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3960 
3961    if (anv_batch_has_error(&cmd_buffer->batch))
3962       return;
3963 
3964    const uint32_t count = (drawCount *
3965                            instanceCount *
3966                            (pipeline->use_primitive_replication ?
3967                             1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
3968    anv_measure_snapshot(cmd_buffer,
3969                         INTEL_SNAPSHOT_DRAW,
3970                         "draw_multi", count);
3971 
3972    genX(cmd_buffer_flush_state)(cmd_buffer);
3973 
3974    if (cmd_buffer->state.conditional_render_enabled)
3975       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3976 
3977    /* Our implementation of VK_KHR_multiview uses instancing to draw the
3978     * different views.  We need to multiply instanceCount by the view count.
3979     */
3980    if (!pipeline->use_primitive_replication)
3981       instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
3982 
3983    uint32_t i = 0;
3984    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3985       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3986                                                  draw->firstVertex,
3987                                                  firstInstance, i, !i);
3988 
3989       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3990          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3991          prim.VertexAccessType         = SEQUENTIAL;
3992          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3993          prim.VertexCountPerInstance   = draw->vertexCount;
3994          prim.StartVertexLocation      = draw->firstVertex;
3995          prim.InstanceCount            = instanceCount;
3996          prim.StartInstanceLocation    = firstInstance;
3997          prim.BaseVertexLocation       = 0;
3998       }
3999    }
4000 
4001    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4002 }
4003 
4004 void genX(CmdDrawIndexed)(
4005     VkCommandBuffer                             commandBuffer,
4006     uint32_t                                    indexCount,
4007     uint32_t                                    instanceCount,
4008     uint32_t                                    firstIndex,
4009     int32_t                                     vertexOffset,
4010     uint32_t                                    firstInstance)
4011 {
4012    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4013    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4014    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4015 
4016    if (anv_batch_has_error(&cmd_buffer->batch))
4017       return;
4018 
4019    const uint32_t count = (indexCount *
4020                            instanceCount *
4021                            (pipeline->use_primitive_replication ?
4022                             1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
4023    anv_measure_snapshot(cmd_buffer,
4024                         INTEL_SNAPSHOT_DRAW,
4025                         "draw indexed",
4026                         count);
4027 
4028    genX(cmd_buffer_flush_state)(cmd_buffer);
4029 
4030    if (cmd_buffer->state.conditional_render_enabled)
4031       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4032 
4033    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
4034 
4035    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4036     * different views.  We need to multiply instanceCount by the view count.
4037     */
4038    if (!pipeline->use_primitive_replication)
4039       instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
4040 
4041    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4042       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4043       prim.VertexAccessType         = RANDOM;
4044       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4045       prim.VertexCountPerInstance   = indexCount;
4046       prim.StartVertexLocation      = firstIndex;
4047       prim.InstanceCount            = instanceCount;
4048       prim.StartInstanceLocation    = firstInstance;
4049       prim.BaseVertexLocation       = vertexOffset;
4050    }
4051 
4052    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4053 }
4054 
4055 void genX(CmdDrawMultiIndexedEXT)(
4056     VkCommandBuffer                             commandBuffer,
4057     uint32_t                                    drawCount,
4058     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
4059     uint32_t                                    instanceCount,
4060     uint32_t                                    firstInstance,
4061     uint32_t                                    stride,
4062     const int32_t                              *pVertexOffset)
4063 {
4064    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4065    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4066    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4067 
4068    if (anv_batch_has_error(&cmd_buffer->batch))
4069       return;
4070 
4071    const uint32_t count = (drawCount *
4072                            instanceCount *
4073                            (pipeline->use_primitive_replication ?
4074                             1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
4075    anv_measure_snapshot(cmd_buffer,
4076                         INTEL_SNAPSHOT_DRAW,
4077                         "draw indexed_multi",
4078                         count);
4079 
4080    genX(cmd_buffer_flush_state)(cmd_buffer);
4081 
4082    if (cmd_buffer->state.conditional_render_enabled)
4083       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4084 
4085    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4086     * different views.  We need to multiply instanceCount by the view count.
4087     */
4088    if (!pipeline->use_primitive_replication)
4089       instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
4090 
4091    uint32_t i = 0;
4092    if (pVertexOffset) {
4093       if (vs_prog_data->uses_drawid) {
4094          bool emitted = true;
4095          if (vs_prog_data->uses_firstvertex ||
4096              vs_prog_data->uses_baseinstance) {
4097             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4098             emitted = true;
4099          }
4100          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4101             if (vs_prog_data->uses_drawid) {
4102                emit_draw_index(cmd_buffer, i);
4103                emitted = true;
4104             }
4105             /* Emitting draw index or vertex index BOs may result in needing
4106              * additional VF cache flushes.
4107              */
4108             if (emitted)
4109                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4110 
4111             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4112                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4113                prim.VertexAccessType         = RANDOM;
4114                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4115                prim.VertexCountPerInstance   = draw->indexCount;
4116                prim.StartVertexLocation      = draw->firstIndex;
4117                prim.InstanceCount            = instanceCount;
4118                prim.StartInstanceLocation    = firstInstance;
4119                prim.BaseVertexLocation       = *pVertexOffset;
4120             }
4121             emitted = false;
4122          }
4123       } else {
4124          if (vs_prog_data->uses_firstvertex ||
4125              vs_prog_data->uses_baseinstance) {
4126             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4127             /* Emitting draw index or vertex index BOs may result in needing
4128              * additional VF cache flushes.
4129              */
4130             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4131          }
4132          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4133             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4134                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4135                prim.VertexAccessType         = RANDOM;
4136                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4137                prim.VertexCountPerInstance   = draw->indexCount;
4138                prim.StartVertexLocation      = draw->firstIndex;
4139                prim.InstanceCount            = instanceCount;
4140                prim.StartInstanceLocation    = firstInstance;
4141                prim.BaseVertexLocation       = *pVertexOffset;
4142             }
4143          }
4144       }
4145    } else {
4146       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4147          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4148                                                     draw->vertexOffset,
4149                                                     firstInstance, i, i != 0);
4150 
4151          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4152             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4153             prim.VertexAccessType         = RANDOM;
4154             prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4155             prim.VertexCountPerInstance   = draw->indexCount;
4156             prim.StartVertexLocation      = draw->firstIndex;
4157             prim.InstanceCount            = instanceCount;
4158             prim.StartInstanceLocation    = firstInstance;
4159             prim.BaseVertexLocation       = draw->vertexOffset;
4160          }
4161       }
4162    }
4163 
4164    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4165 }
4166 
4167 /* Auto-Draw / Indirect Registers */
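/* A 3DPRIMITIVE with IndirectParameterEnable set reads its draw parameters
 * from these MMIO registers; the indirect draw paths below load them with MI
 * commands before emitting the primitive.
 */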
4168 #define GFX7_3DPRIM_END_OFFSET          0x2420
4169 #define GFX7_3DPRIM_START_VERTEX        0x2430
4170 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
4171 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
4172 #define GFX7_3DPRIM_START_INSTANCE      0x243C
4173 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
4174 
4175 void genX(CmdDrawIndirectByteCountEXT)(
4176     VkCommandBuffer                             commandBuffer,
4177     uint32_t                                    instanceCount,
4178     uint32_t                                    firstInstance,
4179     VkBuffer                                    counterBuffer,
4180     VkDeviceSize                                counterBufferOffset,
4181     uint32_t                                    counterOffset,
4182     uint32_t                                    vertexStride)
4183 {
4184 #if GFX_VERx10 >= 75
4185    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4186    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
4187    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4188    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4189 
4190    /* firstVertex is always zero for this draw function */
4191    const uint32_t firstVertex = 0;
4192 
4193    if (anv_batch_has_error(&cmd_buffer->batch))
4194       return;
4195 
4196    anv_measure_snapshot(cmd_buffer,
4197                         INTEL_SNAPSHOT_DRAW,
4198                         "draw indirect byte count",
4199                         instanceCount);
4200 
4201    genX(cmd_buffer_flush_state)(cmd_buffer);
4202 
4203    if (vs_prog_data->uses_firstvertex ||
4204        vs_prog_data->uses_baseinstance)
4205       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
4206    if (vs_prog_data->uses_drawid)
4207       emit_draw_index(cmd_buffer, 0);
4208 
4209    /* Emitting draw index or vertex index BOs may result in needing
4210     * additional VF cache flushes.
4211     */
4212    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4213 
4214    /* Our implementation of VK_KHR_multiview uses instancing to draw the
4215     * different views.  We need to multiply instanceCount by the view count.
4216     */
4217    if (!pipeline->use_primitive_replication)
4218       instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
4219 
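   /* The counter buffer holds the transform feedback byte count written by a
    * previous vkCmdEndTransformFeedbackEXT.  The vertex count is
    * (byte count - counterOffset) / vertexStride, computed on the GPU below.
    */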
4220    struct mi_builder b;
4221    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4222    struct mi_value count =
4223       mi_mem32(anv_address_add(counter_buffer->address,
4224                                    counterBufferOffset));
4225    if (counterOffset)
4226       count = mi_isub(&b, count, mi_imm(counterOffset));
4227    count = mi_udiv32_imm(&b, count, vertexStride);
4228    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
4229 
4230    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
4231    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));
4232    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
4233    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4234 
4235    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4236       prim.IndirectParameterEnable  = true;
4237       prim.VertexAccessType         = SEQUENTIAL;
4238       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4239    }
4240 
4241    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4242 #endif /* GFX_VERx10 >= 75 */
4243 }
4244 
4245 static void
4246 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
4247                          struct anv_address addr,
4248                          bool indexed)
4249 {
4250    struct mi_builder b;
4251    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4252 
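   /* The layout matches VkDrawIndirectCommand (vertexCount, instanceCount,
    * firstVertex, firstInstance) or, for indexed draws,
    * VkDrawIndexedIndirectCommand (indexCount, instanceCount, firstIndex,
    * vertexOffset, firstInstance), all uint32_t fields.
    */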
4253    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
4254                 mi_mem32(anv_address_add(addr, 0)));
4255 
4256    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
4257    unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
4258    if (view_count > 1) {
4259 #if GFX_VERx10 >= 75
4260       instance_count = mi_imul_imm(&b, instance_count, view_count);
4261 #else
4262       anv_finishme("Multiview + indirect draw requires MI_MATH; "
4263                    "MI_MATH is not supported on Ivy Bridge");
4264 #endif
4265    }
4266    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
4267 
4268    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
4269                 mi_mem32(anv_address_add(addr, 8)));
4270 
4271    if (indexed) {
4272       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
4273                    mi_mem32(anv_address_add(addr, 12)));
4274       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4275                    mi_mem32(anv_address_add(addr, 16)));
4276    } else {
4277       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4278                    mi_mem32(anv_address_add(addr, 12)));
4279       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4280    }
4281 }
4282 
4283 void genX(CmdDrawIndirect)(
4284     VkCommandBuffer                             commandBuffer,
4285     VkBuffer                                    _buffer,
4286     VkDeviceSize                                offset,
4287     uint32_t                                    drawCount,
4288     uint32_t                                    stride)
4289 {
4290    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4291    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4292    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4293    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4294 
4295    if (anv_batch_has_error(&cmd_buffer->batch))
4296       return;
4297 
4298    genX(cmd_buffer_flush_state)(cmd_buffer);
4299 
4300    if (cmd_buffer->state.conditional_render_enabled)
4301       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4302 
4303    for (uint32_t i = 0; i < drawCount; i++) {
4304       struct anv_address draw = anv_address_add(buffer->address, offset);
4305 
4306       if (vs_prog_data->uses_firstvertex ||
4307           vs_prog_data->uses_baseinstance)
4308          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4309       if (vs_prog_data->uses_drawid)
4310          emit_draw_index(cmd_buffer, i);
4311 
4312       /* Emitting draw index or vertex index BOs may result in needing
4313        * additional VF cache flushes.
4314        */
4315       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4316 
4317       load_indirect_parameters(cmd_buffer, draw, false);
4318 
4319       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4320          prim.IndirectParameterEnable  = true;
4321          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4322          prim.VertexAccessType         = SEQUENTIAL;
4323          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4324       }
4325 
4326       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4327 
4328       offset += stride;
4329    }
4330 }
4331 
4332 void genX(CmdDrawIndexedIndirect)(
4333     VkCommandBuffer                             commandBuffer,
4334     VkBuffer                                    _buffer,
4335     VkDeviceSize                                offset,
4336     uint32_t                                    drawCount,
4337     uint32_t                                    stride)
4338 {
4339    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4340    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4341    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4342    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4343 
4344    if (anv_batch_has_error(&cmd_buffer->batch))
4345       return;
4346 
4347    genX(cmd_buffer_flush_state)(cmd_buffer);
4348 
4349    if (cmd_buffer->state.conditional_render_enabled)
4350       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4351 
4352    for (uint32_t i = 0; i < drawCount; i++) {
4353       struct anv_address draw = anv_address_add(buffer->address, offset);
4354 
4355       /* TODO: We need to stomp base vertex to 0 somehow */
4356       if (vs_prog_data->uses_firstvertex ||
4357           vs_prog_data->uses_baseinstance)
4358          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4359       if (vs_prog_data->uses_drawid)
4360          emit_draw_index(cmd_buffer, i);
4361 
4362       /* Emitting draw index or vertex index BOs may result in needing
4363        * additional VF cache flushes.
4364        */
4365       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4366 
4367       load_indirect_parameters(cmd_buffer, draw, true);
4368 
4369       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4370          prim.IndirectParameterEnable  = true;
4371          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4372          prim.VertexAccessType         = RANDOM;
4373          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4374       }
4375 
4376       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4377 
4378       offset += stride;
4379    }
4380 }
4381 
4382 static struct mi_value
4383 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4384                                  struct mi_builder *b,
4385                                  struct anv_buffer *count_buffer,
4386                                  uint64_t countBufferOffset)
4387 {
4388    struct anv_address count_address =
4389          anv_address_add(count_buffer->address, countBufferOffset);
4390 
4391    struct mi_value ret = mi_imm(0);
4392 
4393    if (cmd_buffer->state.conditional_render_enabled) {
4394 #if GFX_VERx10 >= 75
4395       ret = mi_new_gpr(b);
4396       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4397 #endif
4398    } else {
4399       /* Upload the current draw count from the draw parameters buffer to
4400        * MI_PREDICATE_SRC0.
4401        */
4402       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4403       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
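      /* Only the high DWord of MI_PREDICATE_SRC1 is cleared here; the low
       * DWord is written with the current draw index in
       * emit_draw_count_predicate().
       */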
4404    }
4405 
4406    return ret;
4407 }
4408 
4409 static void
4410 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4411                           struct mi_builder *b,
4412                           uint32_t draw_index)
4413 {
4414    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4415    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4416 
4417    if (draw_index == 0) {
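      /* For draw 0, COMBINE_SET initializes the predicate from the inverted
       * (LOADINV) result of the (draw_count == 0) comparison, i.e. from
       * draw_count != 0.
       */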
4418       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4419          mip.LoadOperation    = LOAD_LOADINV;
4420          mip.CombineOperation = COMBINE_SET;
4421          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4422       }
4423    } else {
4424       /* While draw_index < draw_count, the comparison is FALSE, so the
4425        * predicate's result will be
4426        *  FALSE ^ TRUE = TRUE
4427        * When draw_index == draw_count, the comparison is TRUE:
4428        *  TRUE ^ TRUE = FALSE
4429        * After that, every result is FALSE ^ FALSE = FALSE.
4430        */
4431       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4432          mip.LoadOperation    = LOAD_LOAD;
4433          mip.CombineOperation = COMBINE_XOR;
4434          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4435       }
4436    }
4437 }
4438 
4439 #if GFX_VERx10 >= 75
4440 static void
4441 emit_draw_count_predicate_with_conditional_render(
4442                           struct anv_cmd_buffer *cmd_buffer,
4443                           struct mi_builder *b,
4444                           uint32_t draw_index,
4445                           struct mi_value max)
4446 {
4447    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4448    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4449 
4450 #if GFX_VER >= 8
4451    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4452 #else
4453    /* MI_PREDICATE_RESULT is not whitelisted by the i915 command parser,
4454     * so we emit MI_PREDICATE to set it instead.
4455     */
4456 
4457    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4458    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4459 
4460    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4461       mip.LoadOperation    = LOAD_LOADINV;
4462       mip.CombineOperation = COMBINE_SET;
4463       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4464    }
4465 #endif
4466 }
4467 #endif
4468 
4469 static void
4470 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4471                                struct mi_builder *b,
4472                                uint32_t draw_index,
4473                                struct mi_value max)
4474 {
4475 #if GFX_VERx10 >= 75
4476    if (cmd_buffer->state.conditional_render_enabled) {
4477       emit_draw_count_predicate_with_conditional_render(
4478             cmd_buffer, b, draw_index, mi_value_ref(b, max));
4479    } else {
4480       emit_draw_count_predicate(cmd_buffer, b, draw_index);
4481    }
4482 #else
4483    emit_draw_count_predicate(cmd_buffer, b, draw_index);
4484 #endif
4485 }
4486 
4487 void genX(CmdDrawIndirectCount)(
4488     VkCommandBuffer                             commandBuffer,
4489     VkBuffer                                    _buffer,
4490     VkDeviceSize                                offset,
4491     VkBuffer                                    _countBuffer,
4492     VkDeviceSize                                countBufferOffset,
4493     uint32_t                                    maxDrawCount,
4494     uint32_t                                    stride)
4495 {
4496    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4497    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4498    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4499    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4500    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4501    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4502 
4503    if (anv_batch_has_error(&cmd_buffer->batch))
4504       return;
4505 
4506    genX(cmd_buffer_flush_state)(cmd_buffer);
4507 
4508    struct mi_builder b;
4509    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4510    struct mi_value max =
4511       prepare_for_draw_count_predicate(cmd_buffer, &b,
4512                                        count_buffer, countBufferOffset);
4513 
4514    for (uint32_t i = 0; i < maxDrawCount; i++) {
4515       struct anv_address draw = anv_address_add(buffer->address, offset);
4516 
4517       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4518 
4519       if (vs_prog_data->uses_firstvertex ||
4520           vs_prog_data->uses_baseinstance)
4521          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4522       if (vs_prog_data->uses_drawid)
4523          emit_draw_index(cmd_buffer, i);
4524 
4525       /* Emitting draw index or vertex index BOs may result in needing
4526        * additional VF cache flushes.
4527        */
4528       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4529 
4530       load_indirect_parameters(cmd_buffer, draw, false);
4531 
4532       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4533          prim.IndirectParameterEnable  = true;
4534          prim.PredicateEnable          = true;
4535          prim.VertexAccessType         = SEQUENTIAL;
4536          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4537       }
4538 
4539       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4540 
4541       offset += stride;
4542    }
4543 
4544    mi_value_unref(&b, max);
4545 }
4546 
4547 void genX(CmdDrawIndexedIndirectCount)(
4548     VkCommandBuffer                             commandBuffer,
4549     VkBuffer                                    _buffer,
4550     VkDeviceSize                                offset,
4551     VkBuffer                                    _countBuffer,
4552     VkDeviceSize                                countBufferOffset,
4553     uint32_t                                    maxDrawCount,
4554     uint32_t                                    stride)
4555 {
4556    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4557    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4558    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4559    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4560    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4561    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4562 
4563    if (anv_batch_has_error(&cmd_buffer->batch))
4564       return;
4565 
4566    genX(cmd_buffer_flush_state)(cmd_buffer);
4567 
4568    struct mi_builder b;
4569    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4570    struct mi_value max =
4571       prepare_for_draw_count_predicate(cmd_buffer, &b,
4572                                        count_buffer, countBufferOffset);
4573 
4574    for (uint32_t i = 0; i < maxDrawCount; i++) {
4575       struct anv_address draw = anv_address_add(buffer->address, offset);
4576 
4577       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4578 
4579       /* TODO: We need to stomp base vertex to 0 somehow */
4580       if (vs_prog_data->uses_firstvertex ||
4581           vs_prog_data->uses_baseinstance)
4582          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4583       if (vs_prog_data->uses_drawid)
4584          emit_draw_index(cmd_buffer, i);
4585 
4586       /* Emitting draw index or vertex index BOs may result in needing
4587        * additional VF cache flushes.
4588        */
4589       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4590 
4591       load_indirect_parameters(cmd_buffer, draw, true);
4592 
4593       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4594          prim.IndirectParameterEnable  = true;
4595          prim.PredicateEnable          = true;
4596          prim.VertexAccessType         = RANDOM;
4597          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4598       }
4599 
4600       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4601 
4602       offset += stride;
4603    }
4604 
4605    mi_value_unref(&b, max);
4606 }
4607 
4608 void genX(CmdBeginTransformFeedbackEXT)(
4609     VkCommandBuffer                             commandBuffer,
4610     uint32_t                                    firstCounterBuffer,
4611     uint32_t                                    counterBufferCount,
4612     const VkBuffer*                             pCounterBuffers,
4613     const VkDeviceSize*                         pCounterBufferOffsets)
4614 {
4615    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4616 
4617    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4618    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4619    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4620 
4621    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4622     *
4623     *    "Software must ensure that no HW stream output operations can be in
4624     *    process or otherwise pending at the point that the MI_LOAD/STORE
4625     *    commands are processed. This will likely require a pipeline flush."
4626     */
4627    anv_add_pending_pipe_bits(cmd_buffer,
4628                              ANV_PIPE_CS_STALL_BIT,
4629                              "begin transform feedback");
4630    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4631 
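   /* SO_WRITE_OFFSET0..3 are consecutive 32-bit registers, hence the
    * idx * 4 byte offset from SO_WRITE_OFFSET0 below.
    */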
4632    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4633       /* If we have a counter buffer, this is a resume so we need to load the
4634        * value into the streamout offset register.  Otherwise, this is a begin
4635        * and we need to reset it to zero.
4636        */
4637       if (pCounterBuffers &&
4638           idx >= firstCounterBuffer &&
4639           idx - firstCounterBuffer < counterBufferCount &&
4640           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4641          uint32_t cb_idx = idx - firstCounterBuffer;
4642          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4643          uint64_t offset = pCounterBufferOffsets ?
4644                            pCounterBufferOffsets[cb_idx] : 0;
4645 
4646          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4647             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4648             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
4649                                                    offset);
4650          }
4651       } else {
4652          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4653             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4654             lri.DataDWord        = 0;
4655          }
4656       }
4657    }
4658 
4659    cmd_buffer->state.xfb_enabled = true;
4660    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4661 }
4662 
4663 void genX(CmdEndTransformFeedbackEXT)(
4664     VkCommandBuffer                             commandBuffer,
4665     uint32_t                                    firstCounterBuffer,
4666     uint32_t                                    counterBufferCount,
4667     const VkBuffer*                             pCounterBuffers,
4668     const VkDeviceSize*                         pCounterBufferOffsets)
4669 {
4670    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4671 
4672    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4673    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4674    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4675 
4676    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4677     *
4678     *    "Software must ensure that no HW stream output operations can be in
4679     *    process or otherwise pending at the point that the MI_LOAD/STORE
4680     *    commands are processed. This will likely require a pipeline flush."
4681     */
4682    anv_add_pending_pipe_bits(cmd_buffer,
4683                              ANV_PIPE_CS_STALL_BIT,
4684                              "end transform feedback");
4685    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4686 
4687    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
4688       unsigned idx = firstCounterBuffer + cb_idx;
4689 
4690       /* If we have a counter buffer, this is a pause so we need to store the
4691        * current value of the streamout offset register into it so that a later
4692        * resume can reload it.  Otherwise, there is nothing to save.
4693        */
4694       if (pCounterBuffers &&
4695           cb_idx < counterBufferCount &&
4696           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
4697          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4698          uint64_t offset = pCounterBufferOffsets ?
4699                            pCounterBufferOffsets[cb_idx] : 0;
4700 
4701          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
4702             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
4703                                                    offset);
4704             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4705          }
4706       }
4707    }
4708 
4709    cmd_buffer->state.xfb_enabled = false;
4710    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4711 }
4712 
4713 void
4714 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
4715 {
4716    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
4717    struct anv_compute_pipeline *pipeline = comp_state->pipeline;
4718 
4719    assert(pipeline->cs);
4720 
4721    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
4722 
4723    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
4724 
4725    /* Apply any pending pipeline flushes we may have.  We want to apply them
4726     * now because, if any of those flushes are for things like push constants,
4727     * the GPU will read the state at weird times.
4728     */
4729    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4730 
4731    if (cmd_buffer->state.compute.pipeline_dirty) {
4732       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4733        *
4734        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4735        *    the only bits that are changed are scoreboard related: Scoreboard
4736        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4737        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
4738        *    sufficient."
4739        */
4740       anv_add_pending_pipe_bits(cmd_buffer,
4741                               ANV_PIPE_CS_STALL_BIT,
4742                               "flush compute state");
4743       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4744 
4745       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
4746 
4747       /* The workgroup size of the pipeline affects our push constant layout
4748        * so flag push constants as dirty if we change the pipeline.
4749        */
4750       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4751    }
4752 
4753    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
4754        cmd_buffer->state.compute.pipeline_dirty) {
4755       flush_descriptor_sets(cmd_buffer,
4756                             &cmd_buffer->state.compute.base,
4757                             VK_SHADER_STAGE_COMPUTE_BIT,
4758                             &pipeline->cs, 1);
4759       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4760 
4761 #if GFX_VERx10 < 125
4762       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
4763       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
4764          .BindingTablePointer =
4765             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
4766          .SamplerStatePointer =
4767             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
4768       };
4769       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
4770 
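      /* Combine the per-draw fields packed above (binding table and sampler
       * state pointers) with the INTERFACE_DESCRIPTOR_DATA baked into the
       * pipeline; the final argument requests 64-byte alignment for the
       * dynamic state allocation.
       */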
4771       struct anv_state state =
4772          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
4773                                       pipeline->interface_descriptor_data,
4774                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
4775                                       64);
4776 
4777       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4778       anv_batch_emit(&cmd_buffer->batch,
4779                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
4780          mid.InterfaceDescriptorTotalLength        = size;
4781          mid.InterfaceDescriptorDataStartAddress   = state.offset;
4782       }
4783 #endif
4784    }
4785 
4786    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
4787       comp_state->push_data =
4788          anv_cmd_buffer_cs_push_constants(cmd_buffer);
4789 
4790 #if GFX_VERx10 < 125
4791       if (comp_state->push_data.alloc_size) {
4792          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
4793             curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
4794             curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
4795          }
4796       }
4797 #endif
4798 
4799       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4800    }
4801 
4802    cmd_buffer->state.compute.pipeline_dirty = false;
4803 
4804    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4805 }
4806 
4807 #if GFX_VER == 7
4808 
4809 static VkResult
4810 verify_cmd_parser(const struct anv_device *device,
4811                   int required_version,
4812                   const char *function)
4813 {
4814    if (device->physical->cmd_parser_version < required_version) {
4815       return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
4816                        "cmd parser version %d is required for %s",
4817                        required_version, function);
4818    } else {
4819       return VK_SUCCESS;
4820    }
4821 }
4822 
4823 #endif
4824 
4825 static void
4826 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
4827                                   uint32_t baseGroupX,
4828                                   uint32_t baseGroupY,
4829                                   uint32_t baseGroupZ)
4830 {
4831    if (anv_batch_has_error(&cmd_buffer->batch))
4832       return;
4833 
4834    struct anv_push_constants *push =
4835       &cmd_buffer->state.compute.base.push_constants;
4836    if (push->cs.base_work_group_id[0] != baseGroupX ||
4837        push->cs.base_work_group_id[1] != baseGroupY ||
4838        push->cs.base_work_group_id[2] != baseGroupZ) {
4839       push->cs.base_work_group_id[0] = baseGroupX;
4840       push->cs.base_work_group_id[1] = baseGroupY;
4841       push->cs.base_work_group_id[2] = baseGroupZ;
4842 
4843       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4844    }
4845 }
4846 
4847 void genX(CmdDispatch)(
4848     VkCommandBuffer                             commandBuffer,
4849     uint32_t                                    x,
4850     uint32_t                                    y,
4851     uint32_t                                    z)
4852 {
4853    genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
4854 }
4855 
4856 #if GFX_VERx10 >= 125
4857 
4858 static inline void
4859 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
4860                     const struct anv_compute_pipeline *pipeline, bool indirect,
4861                     const struct brw_cs_prog_data *prog_data,
4862                     uint32_t groupCountX, uint32_t groupCountY,
4863                     uint32_t groupCountZ)
4864 {
4865    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
4866    const struct anv_shader_bin *cs_bin = pipeline->cs;
4867    bool predicate = cmd_buffer->state.conditional_render_enabled;
4868 
4869    const struct intel_device_info *devinfo = &pipeline->base.device->info;
4870    const struct brw_cs_dispatch_info dispatch =
4871       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
4872 
4873    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
4874       cw.IndirectParameterEnable        = indirect;
4875       cw.PredicateEnable                = predicate;
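      /* dispatch.simd_size is 8, 16, or 32; the SIMDSize field encodes these
       * as 0, 1, and 2, hence the divide by 16.
       */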
4876       cw.SIMDSize                       = dispatch.simd_size / 16;
4877       cw.IndirectDataStartAddress       = comp_state->push_data.offset;
4878       cw.IndirectDataLength             = comp_state->push_data.alloc_size;
4879       cw.LocalXMaximum                  = prog_data->local_size[0] - 1;
4880       cw.LocalYMaximum                  = prog_data->local_size[1] - 1;
4881       cw.LocalZMaximum                  = prog_data->local_size[2] - 1;
4882       cw.ThreadGroupIDXDimension        = groupCountX;
4883       cw.ThreadGroupIDYDimension        = groupCountY;
4884       cw.ThreadGroupIDZDimension        = groupCountZ;
4885       cw.ExecutionMask                  = dispatch.right_mask;
4886 
4887       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
4888          .KernelStartPointer = cs_bin->kernel.offset,
4889          .SamplerStatePointer =
4890             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
4891          .BindingTablePointer =
4892             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
4893          .BindingTableEntryCount =
4894             1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
4895          .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
4896          .SharedLocalMemorySize = encode_slm_size(GFX_VER,
4897                                                   prog_data->base.total_shared),
4898          .NumberOfBarriers = prog_data->uses_barrier,
4899       };
4900    }
4901 }
4902 
4903 #else /* #if GFX_VERx10 >= 125 */
4904 
4905 static inline void
4906 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
4907                   const struct anv_compute_pipeline *pipeline, bool indirect,
4908                   const struct brw_cs_prog_data *prog_data,
4909                   uint32_t groupCountX, uint32_t groupCountY,
4910                   uint32_t groupCountZ)
4911 {
4912    bool predicate = (GFX_VER <= 7 && indirect) ||
4913       cmd_buffer->state.conditional_render_enabled;
4914 
4915    const struct intel_device_info *devinfo = &pipeline->base.device->info;
4916    const struct brw_cs_dispatch_info dispatch =
4917       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
4918 
4919    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
4920       ggw.IndirectParameterEnable      = indirect;
4921       ggw.PredicateEnable              = predicate;
4922       ggw.SIMDSize                     = dispatch.simd_size / 16;
4923       ggw.ThreadDepthCounterMaximum    = 0;
4924       ggw.ThreadHeightCounterMaximum   = 0;
4925       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
4926       ggw.ThreadGroupIDXDimension      = groupCountX;
4927       ggw.ThreadGroupIDYDimension      = groupCountY;
4928       ggw.ThreadGroupIDZDimension      = groupCountZ;
4929       ggw.RightExecutionMask           = dispatch.right_mask;
4930       ggw.BottomExecutionMask          = 0xffffffff;
4931    }
4932 
4933    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
4934 }
4935 
4936 #endif /* #if GFX_VERx10 >= 125 */
4937 
4938 static inline void
4939 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
4940                const struct anv_compute_pipeline *pipeline, bool indirect,
4941                const struct brw_cs_prog_data *prog_data,
4942                uint32_t groupCountX, uint32_t groupCountY,
4943                uint32_t groupCountZ)
4944 {
4945 #if GFX_VERx10 >= 125
4946    emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
4947                        groupCountY, groupCountZ);
4948 #else
4949    emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
4950                      groupCountY, groupCountZ);
4951 #endif
4952 }
4953 
4954 void genX(CmdDispatchBase)(
4955     VkCommandBuffer                             commandBuffer,
4956     uint32_t                                    baseGroupX,
4957     uint32_t                                    baseGroupY,
4958     uint32_t                                    baseGroupZ,
4959     uint32_t                                    groupCountX,
4960     uint32_t                                    groupCountY,
4961     uint32_t                                    groupCountZ)
4962 {
4963    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4964    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4965    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4966 
4967    anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
4968                                      baseGroupY, baseGroupZ);
4969 
4970    if (anv_batch_has_error(&cmd_buffer->batch))
4971       return;
4972 
4973    anv_measure_snapshot(cmd_buffer,
4974                         INTEL_SNAPSHOT_COMPUTE,
4975                         "compute",
4976                         groupCountX * groupCountY * groupCountZ *
4977                         prog_data->local_size[0] * prog_data->local_size[1] *
4978                         prog_data->local_size[2]);
4979 
4980    if (prog_data->uses_num_work_groups) {
4981       struct anv_state state =
4982          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
4983       uint32_t *sizes = state.map;
4984       sizes[0] = groupCountX;
4985       sizes[1] = groupCountY;
4986       sizes[2] = groupCountZ;
4987       cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
4988          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4989          .offset = state.offset,
4990       };
4991 
4992       /* The num_workgroups buffer goes in the binding table */
4993       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4994    }
4995 
4996    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4997 
4998    if (cmd_buffer->state.conditional_render_enabled)
4999       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5000 
5001    emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
5002                   groupCountY, groupCountZ);
5003 }
5004 
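/* MMIO registers holding the three indirect dispatch dimensions.  They are
 * loaded from the indirect buffer below (via the mi_builder) and consumed by
 * the walker command when IndirectParameterEnable is set.
 */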
5005 #define GPGPU_DISPATCHDIMX 0x2500
5006 #define GPGPU_DISPATCHDIMY 0x2504
5007 #define GPGPU_DISPATCHDIMZ 0x2508
5008 
5009 void genX(CmdDispatchIndirect)(
5010     VkCommandBuffer                             commandBuffer,
5011     VkBuffer                                    _buffer,
5012     VkDeviceSize                                offset)
5013 {
5014    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5015    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5016    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5017    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5018    struct anv_address addr = anv_address_add(buffer->address, offset);
5019    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
5020 
5021    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
5022 
5023 #if GFX_VER == 7
5024    /* Linux 4.4 added command parser version 5 which allows the GPGPU
5025     * indirect dispatch registers to be written.
5026     */
5027    if (verify_cmd_parser(cmd_buffer->device, 5,
5028                          "vkCmdDispatchIndirect") != VK_SUCCESS)
5029       return;
5030 #endif
5031 
5032    anv_measure_snapshot(cmd_buffer,
5033                         INTEL_SNAPSHOT_COMPUTE,
5034                         "compute indirect",
5035                         0);
5036 
5037    if (prog_data->uses_num_work_groups) {
5038       cmd_buffer->state.compute.num_workgroups = addr;
5039 
5040       /* The num_workgroups buffer goes in the binding table */
5041       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5042    }
5043 
5044    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5045 
5046    struct mi_builder b;
5047    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5048 
5049    struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
5050    struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
5051    struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
5052 
5053    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
5054    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
5055    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
5056 
5057 #if GFX_VER <= 7
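   /* Build an MI_PREDICATE that disables the walker whenever any of the
    * three indirect dimensions is zero, since a dispatch with zero
    * workgroups must not execute anything.
    */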
5058    /* predicate = (compute_dispatch_indirect_x_size == 0); */
5059    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
5060    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5061    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5062       mip.LoadOperation    = LOAD_LOAD;
5063       mip.CombineOperation = COMBINE_SET;
5064       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5065    }
5066 
5067    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
5068    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
5069    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5070       mip.LoadOperation    = LOAD_LOAD;
5071       mip.CombineOperation = COMBINE_OR;
5072       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5073    }
5074 
5075    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
5076    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
5077    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5078       mip.LoadOperation    = LOAD_LOAD;
5079       mip.CombineOperation = COMBINE_OR;
5080       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5081    }
5082 
5083    /* predicate = !predicate; */
5084    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5085       mip.LoadOperation    = LOAD_LOADINV;
5086       mip.CombineOperation = COMBINE_OR;
5087       mip.CompareOperation = COMPARE_FALSE;
5088    }
5089 
5090 #if GFX_VERx10 == 75
5091    if (cmd_buffer->state.conditional_render_enabled) {
5092       /* predicate &= !(conditional_rendering_predicate == 0); */
5093       mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
5094                    mi_reg32(ANV_PREDICATE_RESULT_REG));
5095       anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5096          mip.LoadOperation    = LOAD_LOADINV;
5097          mip.CombineOperation = COMBINE_AND;
5098          mip.CompareOperation = COMPARE_SRCS_EQUAL;
5099       }
5100    }
5101 #endif
5102 
5103 #else /* GFX_VER > 7 */
5104    if (cmd_buffer->state.conditional_render_enabled)
5105       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5106 #endif
5107 
5108    emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
5109 }
5110 
5111 #if GFX_VERx10 >= 125
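/* Pick log2 local workgroup sizes for a TraceRays launch.  The raygen
 * trampoline below launches a single SIMD8 thread per workgroup, so the
 * three shifts always sum to 3 (a local size of 2^3 = 8 invocations), biased
 * toward the larger global dimensions.  For example,
 * global = { 1920, 1080, 1 } yields local_shift = { 2, 1, 0 }, i.e. a
 * 4x2x1 local group.
 */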
5112 static void
5113 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
5114 {
5115    unsigned total_shift = 0;
5116    memset(local_shift, 0, 3);
5117 
5118    bool progress;
5119    do {
5120       progress = false;
5121       for (unsigned i = 0; i < 3; i++) {
5122          assert(global[i] > 0);
5123          if ((1 << local_shift[i]) < global[i]) {
5124             progress = true;
5125             local_shift[i]++;
5126             total_shift++;
5127          }
5128 
5129          if (total_shift == 3)
5130             return;
5131       }
5132    } while(progress);
5133 
5134    /* Assign whatever's left to x */
5135    local_shift[0] += 3 - total_shift;
5136 }
5137 
5138 static struct GFX_RT_SHADER_TABLE
5139 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
5140 {
5141    return (struct GFX_RT_SHADER_TABLE) {
5142       .BaseAddress = anv_address_from_u64(region->deviceAddress),
5143       .Stride = region->stride,
5144    };
5145 }
5146 
5147 static void
5148 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
5149                       const VkStridedDeviceAddressRegionKHR *raygen_sbt,
5150                       const VkStridedDeviceAddressRegionKHR *miss_sbt,
5151                       const VkStridedDeviceAddressRegionKHR *hit_sbt,
5152                       const VkStridedDeviceAddressRegionKHR *callable_sbt,
5153                       bool is_indirect,
5154                       uint32_t launch_width,
5155                       uint32_t launch_height,
5156                       uint32_t launch_depth,
5157                       uint64_t launch_size_addr)
5158 {
5159    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
5160    struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
5161 
5162    if (anv_batch_has_error(&cmd_buffer->batch))
5163       return;
5164 
5165    /* If we have a known degenerate launch size, just bail */
5166    if (!is_indirect &&
5167        (launch_width == 0 || launch_height == 0 || launch_depth == 0))
5168       return;
5169 
5170    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5171    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5172 
5173    cmd_buffer->state.rt.pipeline_dirty = false;
5174 
5175    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5176 
5177    /* Manually add these to the reloc list: they're internal buffers that
5178     * don't have relocs of their own to get picked up.
5179     *
5180     * TODO(RT): This is a bit of a hack
5181     */
5182    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
5183                          cmd_buffer->batch.alloc,
5184                          rt->scratch.bo);
5185 
5186    /* Allocate and set up our RT_DISPATCH_GLOBALS */
5187    struct anv_state rtdg_state =
5188       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5189                                          BRW_RT_PUSH_CONST_OFFSET +
5190                                          sizeof(struct anv_push_constants),
5191                                          64);
5192 
5193    struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5194       .MemBaseAddress = (struct anv_address) {
5195          .bo = rt->scratch.bo,
5196          .offset = rt->scratch.layout.ray_stack_start,
5197       },
5198       .CallStackHandler =
5199          anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
5200       .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
5201       .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
5202       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5203       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5204       .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
5205       .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
5206       .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
5207       .LaunchWidth = launch_width,
5208       .LaunchHeight = launch_height,
5209       .LaunchDepth = launch_depth,
5210       .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
5211    };
5212    GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
5213 
5214    /* Push constants go after the RT_DISPATCH_GLOBALS */
5215    assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
5216    memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
5217           &cmd_buffer->state.rt.base.push_constants,
5218           sizeof(struct anv_push_constants));
5219 
5220    struct anv_address rtdg_addr = {
5221       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5222       .offset = rtdg_state.offset,
5223    };
5224 
5225    uint8_t local_size_log2[3];
5226    uint32_t global_size[3] = {};
5227    if (is_indirect) {
5228       /* Pick a local size that's probably ok.  We assume most TraceRays calls
5229        * will use a two-dimensional dispatch size.  Worst case, our initial
5230        * dispatch will be a little slower than it has to be.
5231        */
5232       local_size_log2[0] = 2;
5233       local_size_log2[1] = 1;
5234       local_size_log2[2] = 0;
5235 
5236       struct mi_builder b;
5237       mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5238 
5239       struct mi_value launch_size[3] = {
5240          mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
5241          mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
5242          mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
5243       };
5244 
5245       /* Store the original launch size into RT_DISPATCH_GLOBALS
5246        *
5247        * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
5248        * moved into a genX version.
5249        */
5250       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
5251                mi_value_ref(&b, launch_size[0]));
5252       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
5253                mi_value_ref(&b, launch_size[1]));
5254       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
5255                mi_value_ref(&b, launch_size[2]));
5256 
5257       /* Compute the global dispatch size */
5258       for (unsigned i = 0; i < 3; i++) {
5259          if (local_size_log2[i] == 0)
5260             continue;
5261 
5262          /* global_size = DIV_ROUND_UP(launch_size, local_size)
5263           *
5264           * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm
5265           * has the semantics of shifting the entire 64-bit value and taking
5266           * the bottom 32 bits, so we don't have to worry about roll-over.
5267           */
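         /* For example, a launch size of 5 with local_size_log2 == 1 becomes
          * (5 + 1) >> 1 = 3 groups, matching DIV_ROUND_UP(5, 2).
          */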
5268          uint32_t local_size = 1 << local_size_log2[i];
5269          launch_size[i] = mi_iadd(&b, launch_size[i],
5270                                       mi_imm(local_size - 1));
5271          launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
5272                                             local_size_log2[i]);
5273       }
5274 
5275       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
5276       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
5277       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
5278    } else {
5279       uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
5280       calc_local_trace_size(local_size_log2, launch_size);
5281 
5282       for (unsigned i = 0; i < 3; i++) {
5283          /* We have to be a bit careful here because the value DIV_ROUND_UP
5284           * adds to the numerator may overflow.  Cast to uint64_t to avoid this.
5285           */
5286          uint32_t local_size = 1 << local_size_log2[i];
5287          global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
5288       }
5289    }
5290 
5291    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5292       cw.IndirectParameterEnable        = is_indirect;
5293       cw.PredicateEnable                = false;
5294       cw.SIMDSize                       = SIMD8;
5295       cw.LocalXMaximum                  = (1 << local_size_log2[0]) - 1;
5296       cw.LocalYMaximum                  = (1 << local_size_log2[1]) - 1;
5297       cw.LocalZMaximum                  = (1 << local_size_log2[2]) - 1;
5298       cw.ThreadGroupIDXDimension        = global_size[0];
5299       cw.ThreadGroupIDYDimension        = global_size[1];
5300       cw.ThreadGroupIDZDimension        = global_size[2];
5301       cw.ExecutionMask                  = 0xff;
5302       cw.EmitInlineParameter            = true;
5303 
5304       const gl_shader_stage s = MESA_SHADER_RAYGEN;
5305       struct anv_device *device = cmd_buffer->device;
5306       struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
5307       struct anv_state *samplers = &cmd_buffer->state.samplers[s];
5308       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5309          .KernelStartPointer = device->rt_trampoline->kernel.offset,
5310          .SamplerStatePointer = samplers->offset,
5311          /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
5312          .SamplerCount = 0,
5313          .BindingTablePointer = surfaces->offset,
5314          .NumberofThreadsinGPGPUThreadGroup = 1,
5315          .BTDMode = true,
5316       };
5317 
5318       struct brw_rt_raygen_trampoline_params trampoline_params = {
5319          .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
5320          .raygen_bsr_addr = raygen_sbt->deviceAddress,
5321          .is_indirect = is_indirect,
5322          .local_group_size_log2 = {
5323             local_size_log2[0],
5324             local_size_log2[1],
5325             local_size_log2[2],
5326          },
5327       };
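      /* The trampoline parameters travel in COMPUTE_WALKER's 32-byte inline
       * data block (EmitInlineParameter above), so they must fit exactly.
       */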
5328       STATIC_ASSERT(sizeof(trampoline_params) == 32);
5329       memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
5330    }
5331 }
5332 
5333 void
5334 genX(CmdTraceRaysKHR)(
5335     VkCommandBuffer                             commandBuffer,
5336     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
5337     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
5338     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
5339     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
5340     uint32_t                                    width,
5341     uint32_t                                    height,
5342     uint32_t                                    depth)
5343 {
5344    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5345 
5346    cmd_buffer_trace_rays(cmd_buffer,
5347                          pRaygenShaderBindingTable,
5348                          pMissShaderBindingTable,
5349                          pHitShaderBindingTable,
5350                          pCallableShaderBindingTable,
5351                          false /* is_indirect */,
5352                          width, height, depth,
5353                          0 /* launch_size_addr */);
5354 }
5355 
5356 void
5357 genX(CmdTraceRaysIndirectKHR)(
5358     VkCommandBuffer                             commandBuffer,
5359     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
5360     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
5361     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
5362     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
5363     VkDeviceAddress                             indirectDeviceAddress)
5364 {
5365    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5366 
5367    cmd_buffer_trace_rays(cmd_buffer,
5368                          pRaygenShaderBindingTable,
5369                          pMissShaderBindingTable,
5370                          pHitShaderBindingTable,
5371                          pCallableShaderBindingTable,
5372                          true /* is_indirect */,
5373                          0, 0, 0, /* width, height, depth, */
5374                          indirectDeviceAddress);
5375 }
5376 #endif /* GFX_VERx10 >= 125 */
5377 
5378 static void
5379 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
5380                             uint32_t pipeline)
5381 {
5382    UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
5383 
5384    if (cmd_buffer->state.current_pipeline == pipeline)
5385       return;
5386 
5387 #if GFX_VER >= 8 && GFX_VER < 10
5388    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
5389     *
5390     *   Software must clear the COLOR_CALC_STATE Valid field in
5391     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
5392     *   with Pipeline Select set to GPGPU.
5393     *
5394     * The internal hardware docs recommend the same workaround for Gfx9
5395     * hardware too.
5396     */
5397    if (pipeline == GPGPU)
5398       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
5399 #endif
5400 
5401 #if GFX_VER == 9
5402    if (pipeline == _3D) {
5403       /* There is a mid-object preemption workaround which requires you to
5404        * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
5405        * even without preemption, we have issues with geometry flickering when
5406        * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
5407        * really know why.
5408        */
5409       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
5410          vfe.MaximumNumberofThreads =
5411             devinfo->max_cs_threads * devinfo->subslice_total - 1;
5412          vfe.NumberofURBEntries     = 2;
5413          vfe.URBEntryAllocationSize = 2;
5414       }
5415 
5416       /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
5417        * invalid. Set the compute pipeline to dirty to force a re-emit of the
5418        * pipeline in case we get back-to-back dispatch calls with the same
5419        * pipeline and a PIPELINE_SELECT in between.
5420        */
5421       cmd_buffer->state.compute.pipeline_dirty = true;
5422    }
5423 #endif
5424 
5425    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
5426     * PIPELINE_SELECT [DevBWR+]":
5427     *
5428     *   Project: DEVSNB+
5429     *
5430     *   Software must ensure all the write caches are flushed through a
5431     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
5432     *   command to invalidate read only caches prior to programming
5433     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
5434     */
5435    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5436       pc.RenderTargetCacheFlushEnable  = true;
5437       pc.DepthCacheFlushEnable         = true;
5438 #if GFX_VER >= 12
5439       pc.HDCPipelineFlushEnable        = true;
5440 #else
5441       pc.DCFlushEnable                 = true;
5442 #endif
5443       pc.PostSyncOperation             = NoWrite;
5444       pc.CommandStreamerStallEnable    = true;
5445 #if GFX_VER >= 12
5446       /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be
5447        * set with any PIPE_CONTROL with Depth Flush Enable bit set."
5448        */
5449       pc.DepthStallEnable = true;
5450 #endif
5451       anv_debug_dump_pc(pc);
5452    }
5453 
5454    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5455       pc.TextureCacheInvalidationEnable   = true;
5456       pc.ConstantCacheInvalidationEnable  = true;
5457       pc.StateCacheInvalidationEnable     = true;
5458       pc.InstructionCacheInvalidateEnable = true;
5459       pc.PostSyncOperation                = NoWrite;
5460       anv_debug_dump_pc(pc);
5461    }
5462 
5463    anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
5464 #if GFX_VER >= 9
5465       ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
5466       ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
5467 #endif
5468       ps.PipelineSelection = pipeline;
5469    }
5470 
5471 #if GFX_VER == 9
5472    if (devinfo->is_geminilake) {
5473       /* Project: DevGLK
5474        *
5475        * "This chicken bit works around a hardware issue with barrier logic
5476        *  encountered when switching between GPGPU and 3D pipelines.  To
5477        *  workaround the issue, this mode bit should be set after a pipeline
5478        *  is selected."
5479        */
5480       anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
5481          scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
5482                                                   : GLK_BARRIER_MODE_3D_HULL;
5483          scec1.GLKBarrierModeMask = 1;
5484       }
5485    }
5486 #endif
5487 
5488    cmd_buffer->state.current_pipeline = pipeline;
5489 }
5490 
5491 void
5492 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
5493 {
5494    genX(flush_pipeline_select)(cmd_buffer, _3D);
5495 }
5496 
5497 void
5498 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
5499 {
5500    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
5501 }
5502 
5503 void
5504 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
5505 {
5506    if (GFX_VER >= 8)
5507       return;
5508 
5509    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
5510     *
5511     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
5512     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
5513     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
5514     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
5515     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
5516     *    Depth Flush Bit set, followed by another pipelined depth stall
5517     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
5518     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
5519     *    via a preceding MI_FLUSH)."
5520     */
5521    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
5522       pipe.DepthStallEnable = true;
5523       anv_debug_dump_pc(pipe);
5524    }
5525    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
5526       pipe.DepthCacheFlushEnable = true;
5527 #if GFX_VER >= 12
5528       pipe.TileCacheFlushEnable = true;
5529 #endif
5530       anv_debug_dump_pc(pipe);
5531    }
5532    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
5533       pipe.DepthStallEnable = true;
5534       anv_debug_dump_pc(pipe);
5535    }
5536 }
5537 
5538 void
5539 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
5540                                      const struct isl_surf *surf)
5541 {
5542 #if GFX_VERx10 == 120
5543    const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;
5544 
5545    switch (cmd_buffer->state.depth_reg_mode) {
5546    case ANV_DEPTH_REG_MODE_HW_DEFAULT:
5547       if (!fmt_is_d16)
5548          return;
5549       break;
5550    case ANV_DEPTH_REG_MODE_D16:
5551       if (fmt_is_d16)
5552          return;
5553       break;
5554    case ANV_DEPTH_REG_MODE_UNKNOWN:
5555       break;
5556    }
5557 
5558    /* We'll change some CHICKEN registers depending on the depth surface
5559     * format. Do a depth flush and stall so the pipeline is not using these
5560     * settings while we change the registers.
5561     */
5562    anv_add_pending_pipe_bits(cmd_buffer,
5563                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
5564                              ANV_PIPE_DEPTH_STALL_BIT |
5565                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
5566                              "Workaround: Stop pipeline for 14010455700");
5567    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5568 
5569    /* Wa_14010455700
5570     *
5571     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
5572     * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
5573     */
5574    anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
5575       reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
5576       reg.HIZPlaneOptimizationdisablebitMask = true;
5577    }
5578 
5579    /* Wa_1806527549
5580     *
5581     * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
5582     */
5583    anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
5584       reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
5585       reg.HZDepthTestLEGEOptimizationDisableMask = true;
5586    }
5587 
5588    cmd_buffer->state.depth_reg_mode =
5589       fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
5590 #endif
5591 }
5592 
5593 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
5594  *
5595  *    "The VF cache needs to be invalidated before binding and then using
5596  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
5597  *    (at a 64B granularity) since the last invalidation.  A VF cache
5598  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
5599  *    bit in PIPE_CONTROL."
5600  *
5601  * This is implemented by carefully tracking all vertex and index buffer
5602  * bindings and flushing if the cache ever ends up with a range in the cache
5603  * that would exceed 4 GiB.  This is implemented in three parts:
5604  *
5605  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
5606  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
5607  *       tracking code of the new binding.  If this new binding would cause
5608  *       the cache to have a too-large range on the next draw call, a pipeline
5609  *       stall and VF cache invalidate are added to pending_pipeline_bits.
5610  *
5611  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
5612  *       empty whenever we emit a VF invalidate.
5613  *
5614  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
5615  *       after every 3DPRIMITIVE and copies the bound range into the dirty
5616  *       range for each used buffer.  This has to be a separate step because
5617  *       we don't always re-bind all buffers and so 1. can't know which
5618  *       buffers are actually bound.
5619  */
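/* Illustrative example (not from the PRM): if vertex buffer 0 is bound at
 * 0x100000000 for one draw and then rebound at 0x200010000 for a later draw
 * with no VF cache invalidate in between, slot 0's dirty range grows past
 * 4 GiB and step 1 above queues a CS stall + VF cache invalidate before the
 * next 3DPRIMITIVE.
 */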
5620 void
5621 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
5622                                                int vb_index,
5623                                                struct anv_address vb_address,
5624                                                uint32_t vb_size)
5625 {
5626    if (GFX_VER < 8 || GFX_VER > 9 ||
5627        !anv_use_softpin(cmd_buffer->device->physical))
5628       return;
5629 
5630    struct anv_vb_cache_range *bound, *dirty;
5631    if (vb_index == -1) {
5632       bound = &cmd_buffer->state.gfx.ib_bound_range;
5633       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
5634    } else {
5635       assert(vb_index >= 0);
5636       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
5637       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
5638       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
5639       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
5640    }
5641 
5642    if (vb_size == 0) {
5643       bound->start = 0;
5644       bound->end = 0;
5645       return;
5646    }
5647 
5648    assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
5649    bound->start = intel_48b_address(anv_address_physical(vb_address));
5650    bound->end = bound->start + vb_size;
5651    assert(bound->end > bound->start); /* No overflow */
5652 
5653    /* Align everything to a cache line */
5654    bound->start &= ~(64ull - 1ull);
5655    bound->end = align_u64(bound->end, 64);
5656 
5657    /* Compute the dirty range */
5658    dirty->start = MIN2(dirty->start, bound->start);
5659    dirty->end = MAX2(dirty->end, bound->end);
5660 
5661    /* If our range is larger than 32 bits, we have to flush */
5662    assert(bound->end - bound->start <= (1ull << 32));
5663    if (dirty->end - dirty->start > (1ull << 32)) {
5664       anv_add_pending_pipe_bits(cmd_buffer,
5665                                 ANV_PIPE_CS_STALL_BIT |
5666                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
5667                                 "vb > 32b range");
5668    }
5669 }
5670 
5671 void
5672 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
5673                                                     uint32_t access_type,
5674                                                     uint64_t vb_used)
5675 {
5676    if (GFX_VER < 8 || GFX_VER > 9 ||
5677        !anv_use_softpin(cmd_buffer->device->physical))
5678       return;
5679 
5680    if (access_type == RANDOM) {
5681       /* We have an index buffer */
5682       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
5683       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
5684 
5685       if (bound->end > bound->start) {
5686          dirty->start = MIN2(dirty->start, bound->start);
5687          dirty->end = MAX2(dirty->end, bound->end);
5688       }
5689    }
5690 
5691    uint64_t mask = vb_used;
5692    while (mask) {
5693       int i = u_bit_scan64(&mask);
5694       assert(i >= 0);
5695       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
5696       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
5697 
5698       struct anv_vb_cache_range *bound, *dirty;
5699       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
5700       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
5701 
5702       if (bound->end > bound->start) {
5703          dirty->start = MIN2(dirty->start, bound->start);
5704          dirty->end = MAX2(dirty->end, bound->end);
5705       }
5706    }
5707 }
5708 
5709 /**
5710  * Update the pixel hashing modes that determine the balancing of PS threads
5711  * across subslices and slices.
5712  *
5713  * \param width Width bound of the rendering area (already scaled down if \p
5714  *              scale is greater than 1).
5715  * \param height Height bound of the rendering area (already scaled down if \p
5716  *               scale is greater than 1).
5717  * \param scale The number of framebuffer samples that could potentially be
5718  *              affected by an individual channel of the PS thread.  This is
5719  *              typically one for single-sampled rendering, but for operations
5720  *              like CCS resolves and fast clears a single PS invocation may
5721  *              update a huge number of pixels, in which case a finer
5722  *              balancing is desirable in order to maximally utilize the
5723  *              bandwidth available.  UINT_MAX can be used as shorthand for
5724  *              "finest hashing mode available".
5725  */
5726 void
5727 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
5728                                    unsigned width, unsigned height,
5729                                    unsigned scale)
5730 {
5731 #if GFX_VER == 9
5732    const struct intel_device_info *devinfo = &cmd_buffer->device->info;
5733    const unsigned slice_hashing[] = {
5734       /* Because all Gfx9 platforms with more than one slice require
5735        * three-way subslice hashing, a single "normal" 16x16 slice hashing
5736        * block is guaranteed to suffer from substantial imbalance, with one
5737        * subslice receiving twice as much work as the other two in the
5738        * slice.
5739        *
5740        * The performance impact of that would be particularly severe when
5741        * three-way hashing is also in use for slice balancing (which is the
5742        * case for all Gfx9 GT4 platforms), because one of the slices
5743        * receives one every three 16x16 blocks in either direction, which
5744        * is roughly the periodicity of the underlying subslice imbalance
5745        * pattern ("roughly" because in reality the hardware's
5746        * implementation of three-way hashing doesn't do exact modulo 3
5747        * arithmetic, which somewhat decreases the magnitude of this effect
5748        * in practice).  This leads to a systematic subslice imbalance
5749        * within that slice regardless of the size of the primitive.  The
5750        * 32x32 hashing mode guarantees that the subslice imbalance within a
5751        * single slice hashing block is minimal, largely eliminating this
5752        * effect.
5753        */
5754       _32x32,
5755       /* Finest slice hashing mode available. */
5756       NORMAL
5757    };
5758    const unsigned subslice_hashing[] = {
5759       /* 16x16 would provide a slight cache locality benefit especially
5760        * visible in the sampler L1 cache efficiency of low-bandwidth
5761        * non-LLC platforms, but it comes at the cost of greater subslice
5762        * imbalance for primitives of dimensions approximately intermediate
5763        * between 16x4 and 16x16.
5764        */
5765       _16x4,
5766       /* Finest subslice hashing mode available. */
5767       _8x4
5768    };
5769    /* Dimensions of the smallest hashing block of a given hashing mode.  If
5770     * the rendering area is smaller than this there can't possibly be any
5771     * benefit from switching to this mode, so we optimize out the
5772     * transition.
5773     */
5774    const unsigned min_size[][2] = {
5775          { 16, 4 },
5776          { 8, 4 }
5777    };
5778    const unsigned idx = scale > 1;
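   /* scale == 1 selects the coarser 32x32 slice / 16x4 subslice modes above;
    * anything larger (e.g. UINT_MAX for fast clears and resolves) selects
    * the finest NORMAL / 8x4 modes.
    */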
5779 
5780    if (cmd_buffer->state.current_hash_scale != scale &&
5781        (width > min_size[idx][0] || height > min_size[idx][1])) {
5782       anv_add_pending_pipe_bits(cmd_buffer,
5783                                 ANV_PIPE_CS_STALL_BIT |
5784                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
5785                                 "change pixel hash mode");
5786       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5787 
5788       anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
5789          gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
5790          gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
5791          gt.SubsliceHashing = subslice_hashing[idx];
5792          gt.SubsliceHashingMask = -1;
5793       }
5794 
5795       cmd_buffer->state.current_hash_scale = scale;
5796    }
5797 #endif
5798 }
5799 
5800 static void
5801 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
5802 {
5803    struct anv_device *device = cmd_buffer->device;
5804    const struct anv_image_view *iview =
5805       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
5806    const struct anv_image *image = iview ? iview->image : NULL;
5807 
5808    /* FIXME: Width and Height are wrong */
5809 
5810    genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
5811 
5812    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
5813                                         device->isl_dev.ds.size / 4);
5814    if (dw == NULL)
5815       return;
5816 
5817    struct isl_depth_stencil_hiz_emit_info info = { };
5818 
5819    if (iview)
5820       info.view = &iview->planes[0].isl;
5821 
5822    if (image && (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
5823       const uint32_t depth_plane =
5824          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
5825       const struct anv_surface *depth_surface =
5826          &image->planes[depth_plane].primary_surface;
5827       const struct anv_address depth_address =
5828          anv_image_address(image, &depth_surface->memory_range);
5829 
5830       info.depth_surf = &depth_surface->isl;
5831 
5832       info.depth_address =
5833          anv_batch_emit_reloc(&cmd_buffer->batch,
5834                               dw + device->isl_dev.ds.depth_offset / 4,
5835                               depth_address.bo, depth_address.offset);
5836       info.mocs =
5837          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
5838 
5839       const uint32_t ds =
5840          cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
5841       info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
5842       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
5843          assert(isl_aux_usage_has_hiz(info.hiz_usage));
5844 
5845          const struct anv_surface *hiz_surface =
5846             &image->planes[depth_plane].aux_surface;
5847          const struct anv_address hiz_address =
5848             anv_image_address(image, &hiz_surface->memory_range);
5849 
5850          info.hiz_surf = &hiz_surface->isl;
5851 
5852          info.hiz_address =
5853             anv_batch_emit_reloc(&cmd_buffer->batch,
5854                                  dw + device->isl_dev.ds.hiz_offset / 4,
5855                                  hiz_address.bo, hiz_address.offset);
5856 
5857          info.depth_clear_value = ANV_HZ_FC_VAL;
5858       }
5859    }
5860 
5861    if (image && (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
5862       const uint32_t stencil_plane =
5863          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5864       const struct anv_surface *stencil_surface =
5865          &image->planes[stencil_plane].primary_surface;
5866       const struct anv_address stencil_address =
5867          anv_image_address(image, &stencil_surface->memory_range);
5868 
5869       info.stencil_surf = &stencil_surface->isl;
5870 
5871       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
5872       info.stencil_address =
5873          anv_batch_emit_reloc(&cmd_buffer->batch,
5874                               dw + device->isl_dev.ds.stencil_offset / 4,
5875                               stencil_address.bo, stencil_address.offset);
5876       info.mocs =
5877          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
5878    }
5879 
5880    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5881 
5882    if (info.depth_surf)
5883       genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
5884 
5885    if (GFX_VER >= 12) {
5886       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5887       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5888 
5889       /* Wa_1408224581
5890        *
5891        * Workaround (Gfx12LP A-step only): an additional pipe control with
5892        * post-sync = store dword operation is required, i.e. emit an extra
5893        * pipe control after the stencil state whenever the surface state
5894        * bits of this state change.
5895        */
5896       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5897          pc.PostSyncOperation = WriteImmediateData;
5898          pc.Address = cmd_buffer->device->workaround_address;
5899       }
5900    }
5901    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5902 }
5903 
5904 /**
5905  * This ANDs the view mask of the current subpass with the pending clear
5906  * views in the attachment to get the mask of views active in the subpass
5907  * that still need to be cleared.
5908  */
5909 static inline uint32_t
5910 get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state,
5911                                  const struct anv_attachment_state *att_state)
5912 {
5913    return cmd_state->subpass->view_mask & att_state->pending_clear_views;
5914 }
5915 
5916 static inline bool
5917 do_first_layer_clear(const struct anv_cmd_state *cmd_state,
5918                      const struct anv_attachment_state *att_state)
5919 {
5920    if (!cmd_state->subpass->view_mask)
5921       return true;
5922 
5923    uint32_t pending_clear_mask =
5924       get_multiview_subpass_clear_mask(cmd_state, att_state);
5925 
5926    return pending_clear_mask & 1;
5927 }
5928 
5929 static inline bool
5930 current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state,
5931                                        uint32_t att_idx)
5932 {
5933    const uint32_t last_subpass_idx =
5934       cmd_state->pass->attachments[att_idx].last_subpass_idx;
5935    const struct anv_subpass *last_subpass =
5936       &cmd_state->pass->subpasses[last_subpass_idx];
5937    return last_subpass == cmd_state->subpass;
5938 }
5939 
5940 static void
5941 cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
5942                          uint32_t subpass_id)
5943 {
5944    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5945    struct anv_render_pass *pass = cmd_state->pass;
5946    struct anv_subpass *subpass = &pass->subpasses[subpass_id];
5947    cmd_state->subpass = subpass;
5948 
5949    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5950 
5951    /* Our implementation of VK_KHR_multiview uses instancing to draw the
5952     * different views.  If the client asks for instancing, we need to use the
5953     * Instance Data Step Rate to ensure that we repeat the client's
5954     * per-instance data once for each view.  Since this bit is in
5955     * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
5956     * of each subpass.
5957     */
5958    if (GFX_VER == 7)
5959       cmd_buffer->state.gfx.vb_dirty |= ~0;
5960 
5961    /* It is possible to start a render pass with an old pipeline.  Because the
5962     * render pass and subpass index are both baked into the pipeline, this is
5963     * highly unlikely.  In order to do so, it requires that you have a render
5964     * pass with a single subpass and that you use that render pass twice
5965     * back-to-back and use the same pipeline at the start of the second render
5966     * pass as at the end of the first.  In order to avoid unpredictable issues
5967     * with this edge case, we just dirty the pipeline at the start of every
5968     * subpass.
5969     */
5970    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
5971 
5972    /* Accumulate any subpass flushes that need to happen before the subpass */
5973    anv_add_pending_pipe_bits(cmd_buffer,
5974                              cmd_buffer->state.pass->subpass_flushes[subpass_id],
5975                              "begin subpass deps/attachments");
5976 
5977    VkRect2D render_area = cmd_buffer->state.render_area;
5978    struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
5979 
5980    bool is_multiview = subpass->view_mask != 0;
5981 
5982    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5983       const uint32_t a = subpass->attachments[i].attachment;
5984       if (a == VK_ATTACHMENT_UNUSED)
5985          continue;
5986 
5987       assert(a < cmd_state->pass->attachment_count);
5988       struct anv_attachment_state *att_state = &cmd_state->attachments[a];
5989 
5990       struct anv_image_view *iview = cmd_state->attachments[a].image_view;
5991       const struct anv_image *image = iview->image;
5992 
5993       VkImageLayout target_layout = subpass->attachments[i].layout;
5994       VkImageLayout target_stencil_layout =
5995          subpass->attachments[i].stencil_layout;
5996 
5997       uint32_t level = iview->planes[0].isl.base_level;
5998       uint32_t width = anv_minify(iview->image->vk.extent.width, level);
5999       uint32_t height = anv_minify(iview->image->vk.extent.height, level);
6000       bool full_surface_draw =
6001          render_area.offset.x == 0 && render_area.offset.y == 0 &&
6002          render_area.extent.width == width &&
6003          render_area.extent.height == height;
6004 
6005       uint32_t base_layer, layer_count;
6006       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
6007          base_layer = 0;
6008          layer_count = anv_minify(iview->image->vk.extent.depth, level);
6009       } else {
6010          base_layer = iview->planes[0].isl.base_array_layer;
6011          layer_count = fb->layers;
6012       }
6013 
6014       if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
6015          bool will_full_fast_clear =
6016             (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) &&
6017             att_state->fast_clear && full_surface_draw;
6018 
6019          assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
6020          transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
6021                                  level, 1, base_layer, layer_count,
6022                                  att_state->current_layout, target_layout,
6023                                  VK_QUEUE_FAMILY_IGNORED,
6024                                  VK_QUEUE_FAMILY_IGNORED,
6025                                  will_full_fast_clear);
6026          att_state->aux_usage =
6027             anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
6028                                     VK_IMAGE_ASPECT_COLOR_BIT,
6029                                     VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
6030                                     target_layout);
6031       }
6032 
6033       if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
6034          bool will_full_fast_clear =
6035             (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
6036             att_state->fast_clear && full_surface_draw;
6037 
6038          transition_depth_buffer(cmd_buffer, image,
6039                                  base_layer, layer_count,
6040                                  att_state->current_layout, target_layout,
6041                                  will_full_fast_clear);
6042          att_state->aux_usage =
6043             anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
6044                                     VK_IMAGE_ASPECT_DEPTH_BIT,
6045                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6046                                     target_layout);
6047       }
6048 
6049       if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
6050          bool will_full_fast_clear =
6051             (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
6052             att_state->fast_clear && full_surface_draw;
6053 
6054          transition_stencil_buffer(cmd_buffer, image,
6055                                    level, 1, base_layer, layer_count,
6056                                    att_state->current_stencil_layout,
6057                                    target_stencil_layout,
6058                                    will_full_fast_clear);
6059       }
6060       att_state->current_layout = target_layout;
6061       att_state->current_stencil_layout = target_stencil_layout;
6062 
6063       if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
6064          assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
6065 
6066          /* Multi-planar images are not supported as attachments */
6067          assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
6068          assert(image->n_planes == 1);
6069 
6070          uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer;
6071          uint32_t clear_layer_count = fb->layers;
6072 
6073          if (att_state->fast_clear &&
6074              do_first_layer_clear(cmd_state, att_state)) {
6075             /* We only support fast-clears on the first layer */
6076             assert(level == 0 && base_layer == 0);
6077 
6078             union isl_color_value clear_color = {};
6079             anv_clear_color_from_att_state(&clear_color, att_state, iview);
6080             if (iview->image->vk.samples == 1) {
6081                anv_image_ccs_op(cmd_buffer, image,
6082                                 iview->planes[0].isl.format,
6083                                 iview->planes[0].isl.swizzle,
6084                                 VK_IMAGE_ASPECT_COLOR_BIT,
6085                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
6086                                 &clear_color,
6087                                 false);
6088             } else {
6089                anv_image_mcs_op(cmd_buffer, image,
6090                                 iview->planes[0].isl.format,
6091                                 iview->planes[0].isl.swizzle,
6092                                 VK_IMAGE_ASPECT_COLOR_BIT,
6093                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
6094                                 &clear_color,
6095                                 false);
6096             }
6097             base_clear_layer++;
6098             clear_layer_count--;
6099             if (is_multiview)
6100                att_state->pending_clear_views &= ~1;
6101 
6102             if (isl_color_value_is_zero(clear_color,
6103                                         iview->planes[0].isl.format)) {
6104                /* This image has the auxiliary buffer enabled. We can mark the
6105                 * subresource as not needing a resolve because the clear color
6106                 * will match what's in every RENDER_SURFACE_STATE object when
6107                 * it's being used for sampling.
6108                 */
6109                set_image_fast_clear_state(cmd_buffer, iview->image,
6110                                           VK_IMAGE_ASPECT_COLOR_BIT,
6111                                           ANV_FAST_CLEAR_DEFAULT_VALUE);
6112             } else {
6113                set_image_fast_clear_state(cmd_buffer, iview->image,
6114                                           VK_IMAGE_ASPECT_COLOR_BIT,
6115                                           ANV_FAST_CLEAR_ANY);
6116             }
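            /* Worked example with hypothetical values (illustration only): a
             * fast clear of an R8G8B8A8_UNORM attachment to (0, 0, 0, 0) takes
             * the isl_color_value_is_zero() branch above and records
             * ANV_FAST_CLEAR_DEFAULT_VALUE, so later sampling needs no
             * resolve; a clear to (1, 0, 0, 1) records ANV_FAST_CLEAR_ANY
             * instead and a resolve may be required before the surface is
             * sampled.
             */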
6117          }
6118 
6119          /* From the VkFramebufferCreateInfo spec:
6120           *
6121           * "If the render pass uses multiview, then layers must be one and each
6122           *  attachment requires a number of layers that is greater than the
6123           *  maximum bit index set in the view mask in the subpasses in which it
6124           *  is used."
6125           *
6126           * So if multiview is active we ignore the number of layers in the
6127           * framebuffer and instead we honor the view mask from the subpass.
6128           */
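         /* For illustration (hypothetical numbers): with a pending clear mask
          * of 0b0101, derived from the subpass view mask, and a view whose
          * base_array_layer is 2, the loop below clears array layers 2 and 4
          * only, regardless of fb->layers.
          */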
6129          if (is_multiview) {
6130             assert(image->n_planes == 1);
6131             uint32_t pending_clear_mask =
6132                get_multiview_subpass_clear_mask(cmd_state, att_state);
6133 
6134             u_foreach_bit(layer_idx, pending_clear_mask) {
6135                uint32_t layer =
6136                   iview->planes[0].isl.base_array_layer + layer_idx;
6137 
6138                anv_image_clear_color(cmd_buffer, image,
6139                                      VK_IMAGE_ASPECT_COLOR_BIT,
6140                                      att_state->aux_usage,
6141                                      iview->planes[0].isl.format,
6142                                      iview->planes[0].isl.swizzle,
6143                                      level, layer, 1,
6144                                      render_area,
6145                                      vk_to_isl_color(att_state->clear_value.color));
6146             }
6147 
6148             att_state->pending_clear_views &= ~pending_clear_mask;
6149          } else if (clear_layer_count > 0) {
6150             assert(image->n_planes == 1);
6151             anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
6152                                   att_state->aux_usage,
6153                                   iview->planes[0].isl.format,
6154                                   iview->planes[0].isl.swizzle,
6155                                   level, base_clear_layer, clear_layer_count,
6156                                   render_area,
6157                                   vk_to_isl_color(att_state->clear_value.color));
6158          }
6159       } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
6160                                                      VK_IMAGE_ASPECT_STENCIL_BIT)) {
6161          if (att_state->fast_clear &&
6162              (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
6163             /* We currently only support HiZ for single-LOD images */
6164             assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage));
6165             assert(iview->planes[0].isl.base_level == 0);
6166             assert(iview->planes[0].isl.levels == 1);
6167          }
6168 
6169          if (is_multiview) {
6170             uint32_t pending_clear_mask =
6171               get_multiview_subpass_clear_mask(cmd_state, att_state);
6172 
6173             u_foreach_bit(layer_idx, pending_clear_mask) {
6174                uint32_t layer =
6175                   iview->planes[0].isl.base_array_layer + layer_idx;
6176 
6177                if (att_state->fast_clear) {
6178                   anv_image_hiz_clear(cmd_buffer, image,
6179                                       att_state->pending_clear_aspects,
6180                                       level, layer, 1, render_area,
6181                                       att_state->clear_value.depthStencil.stencil);
6182                } else {
6183                   anv_image_clear_depth_stencil(cmd_buffer, image,
6184                                                 att_state->pending_clear_aspects,
6185                                                 att_state->aux_usage,
6186                                                 level, layer, 1, render_area,
6187                                                 att_state->clear_value.depthStencil.depth,
6188                                                 att_state->clear_value.depthStencil.stencil);
6189                }
6190             }
6191 
6192             att_state->pending_clear_views &= ~pending_clear_mask;
6193          } else {
6194             if (att_state->fast_clear) {
6195                anv_image_hiz_clear(cmd_buffer, image,
6196                                    att_state->pending_clear_aspects,
6197                                    level, base_layer, layer_count,
6198                                    render_area,
6199                                    att_state->clear_value.depthStencil.stencil);
6200             } else {
6201                anv_image_clear_depth_stencil(cmd_buffer, image,
6202                                              att_state->pending_clear_aspects,
6203                                              att_state->aux_usage,
6204                                              level, base_layer, layer_count,
6205                                              render_area,
6206                                              att_state->clear_value.depthStencil.depth,
6207                                              att_state->clear_value.depthStencil.stencil);
6208             }
6209          }
6210       } else {
6211          assert(att_state->pending_clear_aspects == 0);
6212       }
6213 
6214       /* If multiview is enabled, then we are only done clearing when we no
6215        * longer have pending layers to clear, or when we have processed the
6216        * last subpass that uses this attachment.
6217        */
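      /* Example (hypothetical view mask): with a view mask of 0b11, clearing
       * only view 0 in this subpass leaves pending_clear_views == 0b10, so
       * pending_clear_aspects stays set and view 1 can still be cleared in a
       * later subpass, unless this is the last subpass using the attachment.
       */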
6218       if (!is_multiview ||
6219           att_state->pending_clear_views == 0 ||
6220           current_subpass_is_last_for_attachment(cmd_state, a)) {
6221          att_state->pending_clear_aspects = 0;
6222       }
6223 
6224       att_state->pending_load_aspects = 0;
6225    }
6226 
6227    /* We've transitioned all our images, possibly fast-clearing them.  Now we
6228     * can fill out the surface states that we will use as render targets
6229     * during actual subpass rendering.
6230     */
6231    VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer,
6232                                                             pass, subpass);
6233    if (result != VK_SUCCESS)
6234       return;
6235 
6236    isl_null_fill_state(&cmd_buffer->device->isl_dev,
6237                        cmd_state->null_surface_state.map,
6238                        .size = isl_extent3d(fb->width, fb->height, fb->layers));
6239 
6240    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6241       const uint32_t att = subpass->attachments[i].attachment;
6242       if (att == VK_ATTACHMENT_UNUSED)
6243          continue;
6244 
6245       assert(att < cmd_state->pass->attachment_count);
6246       struct anv_render_pass_attachment *pass_att = &pass->attachments[att];
6247       struct anv_attachment_state *att_state = &cmd_state->attachments[att];
6248       struct anv_image_view *iview = att_state->image_view;
6249 
6250       if (!vk_format_is_color(pass_att->format))
6251          continue;
6252 
6253       const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
6254       assert(util_bitcount(att_usage) == 1);
6255 
6256       struct anv_surface_state *surface_state;
6257       isl_surf_usage_flags_t isl_surf_usage;
6258       enum isl_aux_usage isl_aux_usage;
6259       if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
6260          surface_state = &att_state->color;
6261          isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
6262          isl_aux_usage = att_state->aux_usage;
6263       } else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
6264          surface_state = &att_state->input;
6265          isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT;
6266          isl_aux_usage =
6267             anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
6268                                     VK_IMAGE_ASPECT_COLOR_BIT,
6269                                     VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
6270                                     att_state->current_layout);
6271       } else {
6272          continue;
6273       }
6274 
6275       /* We had better have a surface state when we get here */
6276       assert(surface_state->state.map);
6277 
6278       union isl_color_value clear_color = { .u32 = { 0, } };
6279       if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6280           att_state->fast_clear)
6281          anv_clear_color_from_att_state(&clear_color, att_state, iview);
6282 
6283       anv_image_fill_surface_state(cmd_buffer->device,
6284                                    iview->image,
6285                                    VK_IMAGE_ASPECT_COLOR_BIT,
6286                                    &iview->planes[0].isl,
6287                                    isl_surf_usage,
6288                                    isl_aux_usage,
6289                                    &clear_color,
6290                                    0,
6291                                    surface_state,
6292                                    NULL);
6293 
6294       add_surface_state_relocs(cmd_buffer, *surface_state);
6295 
6296       if (GFX_VER < 10 &&
6297           pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&
6298           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
6299           iview->planes[0].isl.base_level == 0 &&
6300           iview->planes[0].isl.base_array_layer == 0) {
6301          genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state,
6302                                       iview->image,
6303                                       VK_IMAGE_ASPECT_COLOR_BIT,
6304                                       false /* copy to ss */);
6305       }
6306    }
6307 
6308 #if GFX_VER >= 11
6309    /* The PIPE_CONTROL command description says:
6310     *
6311     *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
6312     *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
6313     *     Target Cache Flush by enabling this bit. When render target flush
6314     *     is set due to new association of BTI, PS Scoreboard Stall bit must
6315     *     be set in this packet."
6316     */
6317    anv_add_pending_pipe_bits(cmd_buffer,
6318                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
6319                              ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6320                              "change RT");
6321 #endif
6322 
6323    cmd_buffer_emit_depth_stencil(cmd_buffer);
6324 }
6325 
6326 static enum blorp_filter
6327 vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)
6328 {
6329    switch (vk_mode) {
6330    case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:
6331       return BLORP_FILTER_SAMPLE_0;
6332    case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:
6333       return BLORP_FILTER_AVERAGE;
6334    case VK_RESOLVE_MODE_MIN_BIT_KHR:
6335       return BLORP_FILTER_MIN_SAMPLE;
6336    case VK_RESOLVE_MODE_MAX_BIT_KHR:
6337       return BLORP_FILTER_MAX_SAMPLE;
6338    default:
6339       return BLORP_FILTER_NONE;
6340    }
6341 }
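/* For reference, two of the mappings performed by the helper above
 * (illustration only):
 *
 *    vk_to_blorp_resolve_mode(VK_RESOLVE_MODE_AVERAGE_BIT_KHR) == BLORP_FILTER_AVERAGE
 *    vk_to_blorp_resolve_mode(VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR) == BLORP_FILTER_SAMPLE_0
 */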
6342 
6343 static void
6344 cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
6345 {
6346    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
6347    struct anv_subpass *subpass = cmd_state->subpass;
6348    uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);
6349    struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
6350 
6351    /* We are done with the previous subpass and all rendering directly to that
6352     * subpass is now complete.  Zero out all the surface states so we don't
6353     * accidentally use them between now and the next subpass.
6354     */
6355    for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) {
6356       memset(&cmd_state->attachments[i].color, 0,
6357              sizeof(cmd_state->attachments[i].color));
6358       memset(&cmd_state->attachments[i].input, 0,
6359              sizeof(cmd_state->attachments[i].input));
6360    }
6361    cmd_state->null_surface_state = ANV_STATE_NULL;
6362    cmd_state->attachment_states = ANV_STATE_NULL;
6363 
6364    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6365       const uint32_t a = subpass->attachments[i].attachment;
6366       if (a == VK_ATTACHMENT_UNUSED)
6367          continue;
6368 
6369       assert(a < cmd_state->pass->attachment_count);
6370       struct anv_attachment_state *att_state = &cmd_state->attachments[a];
6371       struct anv_image_view *iview = att_state->image_view;
6372 
6373       assert(util_bitcount(subpass->attachments[i].usage) == 1);
6374       if (subpass->attachments[i].usage ==
6375           VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
6376          /* We assume that if we're ending a subpass, we did some rendering,
6377           * so we may end up with compressed data.
6378           */
6379          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6380                                              VK_IMAGE_ASPECT_COLOR_BIT,
6381                                              att_state->aux_usage,
6382                                              iview->planes[0].isl.base_level,
6383                                              iview->planes[0].isl.base_array_layer,
6384                                              fb->layers);
6385       } else if (subpass->attachments[i].usage ==
6386                  VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
6387          /* We may be writing depth or stencil so we need to mark the surface.
6388           * Unfortunately, there's no way to know at this point whether the
6389           * depth or stencil tests used will actually write to the surface.
6390           *
6391           * Even though stencil may be plane 1, it always shares a base_level
6392           * with depth.
6393           */
6394          const struct isl_view *ds_view = &iview->planes[0].isl;
6395          if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
6396             genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6397                                                 VK_IMAGE_ASPECT_DEPTH_BIT,
6398                                                 att_state->aux_usage,
6399                                                 ds_view->base_level,
6400                                                 ds_view->base_array_layer,
6401                                                 fb->layers);
6402          }
6403          if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
6404             /* Even though stencil may be plane 1, it always shares a
6405              * base_level with depth.
6406              */
6407             genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6408                                                 VK_IMAGE_ASPECT_STENCIL_BIT,
6409                                                 ISL_AUX_USAGE_NONE,
6410                                                 ds_view->base_level,
6411                                                 ds_view->base_array_layer,
6412                                                 fb->layers);
6413          }
6414       }
6415    }
6416 
6417    if (subpass->has_color_resolve) {
6418       /* We are about to do some MSAA resolves.  We need to flush so that the
6419        * results of writes to the MSAA color attachments show up in the sampler
6420        * when we blit to the single-sampled resolve target.
6421        */
6422       anv_add_pending_pipe_bits(cmd_buffer,
6423                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
6424                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
6425                                 "MSAA resolve");
6426 
6427       for (uint32_t i = 0; i < subpass->color_count; ++i) {
6428          uint32_t src_att = subpass->color_attachments[i].attachment;
6429          uint32_t dst_att = subpass->resolve_attachments[i].attachment;
6430 
6431          if (dst_att == VK_ATTACHMENT_UNUSED)
6432             continue;
6433 
6434          assert(src_att < cmd_buffer->state.pass->attachment_count);
6435          assert(dst_att < cmd_buffer->state.pass->attachment_count);
6436 
6437          if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
6438             /* From the Vulkan 1.0 spec:
6439              *
6440              *    If the first use of an attachment in a render pass is as a
6441              *    resolve attachment, then the loadOp is effectively ignored
6442              *    as the resolve is guaranteed to overwrite all pixels in the
6443              *    render area.
6444              */
6445             cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
6446          }
6447 
6448          struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
6449          struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
6450 
6451          const VkRect2D render_area = cmd_buffer->state.render_area;
6452 
6453          enum isl_aux_usage src_aux_usage =
6454             cmd_buffer->state.attachments[src_att].aux_usage;
6455          enum isl_aux_usage dst_aux_usage =
6456             cmd_buffer->state.attachments[dst_att].aux_usage;
6457 
6458          assert(src_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT &&
6459                 dst_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
6460 
6461          anv_image_msaa_resolve(cmd_buffer,
6462                                 src_iview->image, src_aux_usage,
6463                                 src_iview->planes[0].isl.base_level,
6464                                 src_iview->planes[0].isl.base_array_layer,
6465                                 dst_iview->image, dst_aux_usage,
6466                                 dst_iview->planes[0].isl.base_level,
6467                                 dst_iview->planes[0].isl.base_array_layer,
6468                                 VK_IMAGE_ASPECT_COLOR_BIT,
6469                                 render_area.offset.x, render_area.offset.y,
6470                                 render_area.offset.x, render_area.offset.y,
6471                                 render_area.extent.width,
6472                                 render_area.extent.height,
6473                                 fb->layers, BLORP_FILTER_NONE);
6474       }
6475    }
6476 
6477    if (subpass->ds_resolve_attachment) {
6478       /* We are about to do some MSAA resolves.  We need to flush so that the
6479        * results of writes to the MSAA depth attachments show up in the sampler
6480        * when we blit to the single-sampled resolve target.
6481        */
6482       anv_add_pending_pipe_bits(cmd_buffer,
6483                               ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
6484                               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
6485                               "MSAA resolve");
6486 
6487       uint32_t src_att = subpass->depth_stencil_attachment->attachment;
6488       uint32_t dst_att = subpass->ds_resolve_attachment->attachment;
6489 
6490       assert(src_att < cmd_buffer->state.pass->attachment_count);
6491       assert(dst_att < cmd_buffer->state.pass->attachment_count);
6492 
6493       if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
6494          /* From the Vulkan 1.0 spec:
6495           *
6496           *    If the first use of an attachment in a render pass is as a
6497           *    resolve attachment, then the loadOp is effectively ignored
6498           *    as the resolve is guaranteed to overwrite all pixels in the
6499           *    render area.
6500           */
6501          cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
6502       }
6503 
6504       struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
6505       struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
6506 
6507       const VkRect2D render_area = cmd_buffer->state.render_area;
6508 
6509       struct anv_attachment_state *src_state =
6510          &cmd_state->attachments[src_att];
6511       struct anv_attachment_state *dst_state =
6512          &cmd_state->attachments[dst_att];
6513 
6514       if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
6515           subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
6516 
6517          /* MSAA resolves sample from the source attachment.  Transition the
6518           * depth attachment first to get rid of any HiZ that we may not be
6519           * able to handle.
6520           */
6521          transition_depth_buffer(cmd_buffer, src_iview->image,
6522                                  src_iview->planes[0].isl.base_array_layer,
6523                                  fb->layers,
6524                                  src_state->current_layout,
6525                                  VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
6526                                  false /* will_full_fast_clear */);
6527          src_state->aux_usage =
6528             anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,
6529                                     VK_IMAGE_ASPECT_DEPTH_BIT,
6530                                     VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
6531                                     VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
6532          src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
6533 
6534          /* MSAA resolves write to the resolve attachment as if it were any
6535           * other transfer op.  Transition the resolve attachment accordingly.
6536           */
6537          VkImageLayout dst_initial_layout = dst_state->current_layout;
6538 
6539          /* If our render area is the entire size of the image, we're going to
6540           * blow it all away so we can claim the initial layout is UNDEFINED
6541           * and we'll get a HiZ ambiguate instead of a resolve.
6542           */
6543          if (dst_iview->image->vk.image_type != VK_IMAGE_TYPE_3D &&
6544              render_area.offset.x == 0 && render_area.offset.y == 0 &&
6545              render_area.extent.width == dst_iview->vk.extent.width &&
6546              render_area.extent.height == dst_iview->vk.extent.height)
6547             dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
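         /* Worked example (hypothetical sizes): resolving into a 1920x1080 2D
          * attachment with render_area == { {0, 0}, {1920, 1080} } takes the
          * branch above, so the transition below starts from UNDEFINED and can
          * use a HiZ ambiguate; a 960x540 render area keeps the real layout
          * and a full resolve may be needed instead.
          */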
6548 
6549          transition_depth_buffer(cmd_buffer, dst_iview->image,
6550                                  dst_iview->planes[0].isl.base_array_layer,
6551                                  fb->layers,
6552                                  dst_initial_layout,
6553                                  VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
6554                                  false /* will_full_fast_clear */);
6555          dst_state->aux_usage =
6556             anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,
6557                                     VK_IMAGE_ASPECT_DEPTH_BIT,
6558                                     VK_IMAGE_USAGE_TRANSFER_DST_BIT,
6559                                     VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
6560          dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
6561 
6562          enum blorp_filter filter =
6563             vk_to_blorp_resolve_mode(subpass->depth_resolve_mode);
6564 
6565          anv_image_msaa_resolve(cmd_buffer,
6566                                 src_iview->image, src_state->aux_usage,
6567                                 src_iview->planes[0].isl.base_level,
6568                                 src_iview->planes[0].isl.base_array_layer,
6569                                 dst_iview->image, dst_state->aux_usage,
6570                                 dst_iview->planes[0].isl.base_level,
6571                                 dst_iview->planes[0].isl.base_array_layer,
6572                                 VK_IMAGE_ASPECT_DEPTH_BIT,
6573                                 render_area.offset.x, render_area.offset.y,
6574                                 render_area.offset.x, render_area.offset.y,
6575                                 render_area.extent.width,
6576                                 render_area.extent.height,
6577                                 fb->layers, filter);
6578       }
6579 
6580       if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
6581           subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
6582 
6583          src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
6584          dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
6585 
6586          enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
6587          const uint32_t plane =
6588             anv_image_aspect_to_plane(dst_iview->image, VK_IMAGE_ASPECT_STENCIL_BIT);
6589          enum isl_aux_usage dst_aux_usage =
6590             dst_iview->image->planes[plane].aux_usage;
6591 
6592          enum blorp_filter filter =
6593             vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);
6594 
6595          anv_image_msaa_resolve(cmd_buffer,
6596                                 src_iview->image, src_aux_usage,
6597                                 src_iview->planes[0].isl.base_level,
6598                                 src_iview->planes[0].isl.base_array_layer,
6599                                 dst_iview->image, dst_aux_usage,
6600                                 dst_iview->planes[0].isl.base_level,
6601                                 dst_iview->planes[0].isl.base_array_layer,
6602                                 VK_IMAGE_ASPECT_STENCIL_BIT,
6603                                 render_area.offset.x, render_area.offset.y,
6604                                 render_area.offset.x, render_area.offset.y,
6605                                 render_area.extent.width,
6606                                 render_area.extent.height,
6607                                 fb->layers, filter);
6608       }
6609    }
6610 
6611 #if GFX_VER == 7
6612    /* On gfx7, we have to store a texturable version of the stencil buffer in
6613     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
6614     * forth at strategic points. Stencil writes are only allowed in the following
6615     * layouts:
6616     *
6617     *  - VK_IMAGE_LAYOUT_GENERAL
6618     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
6619     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
6620     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
6621     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
6622     *
6623     * For general, we have no nice opportunity to transition so we do the copy
6624     * to the shadow unconditionally at the end of the subpass. For transfer
6625     * destinations, we can update it as part of the transfer op. For the other
6626     * layouts, we delay the copy until a transition into some other layout.
6627     */
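   /* Example of the GENERAL case handled below (hypothetical image): a
    * stencil-capable image created with VK_IMAGE_USAGE_SAMPLED_BIT that stays
    * in VK_IMAGE_LAYOUT_GENERAL for the whole render pass has its shadow copy
    * refreshed here at the end of every subpass, since there is no layout
    * transition to piggy-back the copy on.
    */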
6628    if (subpass->depth_stencil_attachment) {
6629       uint32_t a = subpass->depth_stencil_attachment->attachment;
6630       assert(a != VK_ATTACHMENT_UNUSED);
6631 
6632       struct anv_attachment_state *att_state = &cmd_state->attachments[a];
6633       struct anv_image_view *iview = cmd_state->attachments[a].image_view;
6634       const struct anv_image *image = iview->image;
6635 
6636       if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
6637          const uint32_t plane =
6638             anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
6639 
6640          if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
6641              att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) {
6642             assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
6643             anv_image_copy_to_shadow(cmd_buffer, image,
6644                                      VK_IMAGE_ASPECT_STENCIL_BIT,
6645                                      iview->planes[plane].isl.base_level, 1,
6646                                      iview->planes[plane].isl.base_array_layer,
6647                                      fb->layers);
6648          }
6649       }
6650    }
6651 #endif /* GFX_VER == 7 */
6652 
6653    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6654       const uint32_t a = subpass->attachments[i].attachment;
6655       if (a == VK_ATTACHMENT_UNUSED)
6656          continue;
6657 
6658       if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id)
6659          continue;
6660 
6661       assert(a < cmd_state->pass->attachment_count);
6662       struct anv_attachment_state *att_state = &cmd_state->attachments[a];
6663       struct anv_image_view *iview = cmd_state->attachments[a].image_view;
6664       const struct anv_image *image = iview->image;
6665 
6666       /* Transition the image into the final layout for this render pass */
6667       VkImageLayout target_layout =
6668          cmd_state->pass->attachments[a].final_layout;
6669       VkImageLayout target_stencil_layout =
6670          cmd_state->pass->attachments[a].stencil_final_layout;
6671 
6672       uint32_t base_layer, layer_count;
6673       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
6674          base_layer = 0;
6675          layer_count = anv_minify(iview->image->vk.extent.depth,
6676                                   iview->planes[0].isl.base_level);
6677       } else {
6678          base_layer = iview->planes[0].isl.base_array_layer;
6679          layer_count = fb->layers;
6680       }
6681 
6682       if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
6683          assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
6684          transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
6685                                  iview->planes[0].isl.base_level, 1,
6686                                  base_layer, layer_count,
6687                                  att_state->current_layout, target_layout,
6688                                  VK_QUEUE_FAMILY_IGNORED,
6689                                  VK_QUEUE_FAMILY_IGNORED,
6690                                  false /* will_full_fast_clear */);
6691       }
6692 
6693       if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
6694          transition_depth_buffer(cmd_buffer, image,
6695                                  base_layer, layer_count,
6696                                  att_state->current_layout, target_layout,
6697                                  false /* will_full_fast_clear */);
6698       }
6699 
6700       if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
6701          transition_stencil_buffer(cmd_buffer, image,
6702                                    iview->planes[0].isl.base_level, 1,
6703                                    base_layer, layer_count,
6704                                    att_state->current_stencil_layout,
6705                                    target_stencil_layout,
6706                                    false /* will_full_fast_clear */);
6707       }
6708    }
6709 
6710    /* Accumulate any subpass flushes that need to happen after the subpass.
6711     * Yes, they do get accumulated twice in the NextSubpass case but since
6712     * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up
6713     * ORing the bits in twice so it's harmless.
6714     */
6715    anv_add_pending_pipe_bits(cmd_buffer,
6716                              cmd_buffer->state.pass->subpass_flushes[subpass_id + 1],
6717                              "end subpass deps/attachments");
6718 }
6719 
6720 void genX(CmdBeginRenderPass2)(
6721     VkCommandBuffer                             commandBuffer,
6722     const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
6723     const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
6724 {
6725    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6726    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBeginInfo->renderPass);
6727    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
6728    VkResult result;
6729 
6730    if (!is_render_queue_cmd_buffer(cmd_buffer)) {
6731       assert(!"Trying to start a render pass on non-render queue!");
6732       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
6733       return;
6734    }
6735 
6736    cmd_buffer->state.framebuffer = framebuffer;
6737    cmd_buffer->state.pass = pass;
6738    cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
6739 
6740    anv_measure_beginrenderpass(cmd_buffer);
6741 
6742    result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
6743                                                framebuffer,
6744                                                pRenderPassBeginInfo);
6745    if (result != VK_SUCCESS) {
6746       assert(anv_batch_has_error(&cmd_buffer->batch));
6747       return;
6748    }
6749 
6750    genX(flush_pipeline_select_3d)(cmd_buffer);
6751 
6752    cmd_buffer_begin_subpass(cmd_buffer, 0);
6753 }
6754 
6755 void genX(CmdNextSubpass2)(
6756     VkCommandBuffer                             commandBuffer,
6757     const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
6758     const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
6759 {
6760    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6761 
6762    if (anv_batch_has_error(&cmd_buffer->batch))
6763       return;
6764 
6765    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6766 
6767    uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state);
6768    cmd_buffer_end_subpass(cmd_buffer);
6769    cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
6770 }
6771 
6772 void genX(CmdEndRenderPass2)(
6773     VkCommandBuffer                             commandBuffer,
6774     const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
6775 {
6776    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6777 
6778    if (anv_batch_has_error(&cmd_buffer->batch))
6779       return;
6780 
6781    cmd_buffer_end_subpass(cmd_buffer);
6782 
6783    cmd_buffer->state.hiz_enabled = false;
6784 
6785    /* Remove references to render-pass-specific state. This enables us to
6786     * detect whether or not we're in a render pass.
6787     */
6788    cmd_buffer->state.framebuffer = NULL;
6789    cmd_buffer->state.pass = NULL;
6790    cmd_buffer->state.subpass = NULL;
6791 }
6792 
6793 void
6794 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
6795 {
6796 #if GFX_VERx10 >= 75
6797    struct mi_builder b;
6798    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
6799 
6800    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
6801                 mi_reg32(ANV_PREDICATE_RESULT_REG));
6802    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
6803 
6804    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
6805       mip.LoadOperation    = LOAD_LOADINV;
6806       mip.CombineOperation = COMBINE_SET;
6807       mip.CompareOperation = COMPARE_SRCS_EQUAL;
6808    }
6809 #endif
6810 }
6811 
6812 #if GFX_VERx10 >= 75
6813 void genX(CmdBeginConditionalRenderingEXT)(
6814    VkCommandBuffer                             commandBuffer,
6815    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
6816 {
6817    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6818    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
6819    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
6820    struct anv_address value_address =
6821       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
6822 
6823    const bool isInverted = pConditionalRenderingBegin->flags &
6824                            VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
6825 
6826    cmd_state->conditional_render_enabled = true;
6827 
6828    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6829 
6830    struct mi_builder b;
6831    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
6832 
6833    /* Section 19.4 of the Vulkan 1.1.85 spec says:
6834     *
6835     *    If the value of the predicate in buffer memory changes
6836     *    while conditional rendering is active, the rendering commands
6837     *    may be discarded in an implementation-dependent way.
6838     *    Some implementations may latch the value of the predicate
6839     *    upon beginning conditional rendering while others
6840     *    may read it before every rendering command.
6841     *
6842     * So it's perfectly fine to read a value from the buffer once.
6843     */
6844    struct mi_value value = mi_mem32(value_address);
6845 
6846    /* Precompute the predicate result; this is necessary to support secondary
6847     * command buffers, since it is unknown whether conditional rendering is
6848     * inverted when they are populated.
6849     */
6850    mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
6851                 isInverted ? mi_uge(&b, mi_imm(0), value) :
6852                              mi_ult(&b, mi_imm(0), value));
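   /* Rough sketch of what the store above computes (illustration only, not a
    * precise model of the MI math):
    *
    *    pred = isInverted ? (*value == 0) : (*value != 0);
    *    ANV_PREDICATE_RESULT_REG = pred ? non-zero : 0;
    *
    * mi_ult(0, value) implements the unsigned "value != 0" compare and
    * mi_uge(0, value) the "value == 0" case, matching the Vulkan rule that a
    * non-zero predicate executes the conditional commands unless the inverted
    * flag is set.
    */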
6853 }
6854 
6855 void genX(CmdEndConditionalRenderingEXT)(
6856     VkCommandBuffer                             commandBuffer)
6857 {
6858    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6859    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
6860 
6861    cmd_state->conditional_render_enabled = false;
6862 }
6863 #endif
6864 
6865 /* Set of stage bits that are pipelined, i.e. they get queued
6866  * by the command streamer for later execution.
6867  */
6868 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
6869    ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | \
6870      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR | \
6871      VK_PIPELINE_STAGE_2_HOST_BIT_KHR | \
6872      VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
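/* Usage sketch (illustrative values only): a dependency whose only source
 * stage is VK_PIPELINE_STAGE_2_HOST_BIT_KHR has no bits left after masking
 * with ANV_PIPELINE_STAGE_PIPELINED_BITS, so the event code below can skip
 * the CS stall and pixel scoreboard stall.
 */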
6873 
6874 void genX(CmdSetEvent2KHR)(
6875     VkCommandBuffer                             commandBuffer,
6876     VkEvent                                     _event,
6877     const VkDependencyInfoKHR*                  pDependencyInfo)
6878 {
6879    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6880    ANV_FROM_HANDLE(anv_event, event, _event);
6881 
6882    VkPipelineStageFlags2KHR src_stages = 0;
6883 
6884    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
6885       src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
6886    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
6887       src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
6888    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
6889       src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
6890 
6891    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6892    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6893 
6894    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6895       if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
6896          pc.StallAtPixelScoreboard = true;
6897          pc.CommandStreamerStallEnable = true;
6898       }
6899 
6900       pc.DestinationAddressType  = DAT_PPGTT;
6901       pc.PostSyncOperation       = WriteImmediateData;
6902       pc.Address = (struct anv_address) {
6903          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
6904          event->state.offset
6905       };
6906       pc.ImmediateData           = VK_EVENT_SET;
6907       anv_debug_dump_pc(pc);
6908    }
6909 }
6910 
6911 void genX(CmdResetEvent2KHR)(
6912     VkCommandBuffer                             commandBuffer,
6913     VkEvent                                     _event,
6914     VkPipelineStageFlags2KHR                    stageMask)
6915 {
6916    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6917    ANV_FROM_HANDLE(anv_event, event, _event);
6918 
6919    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6920    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6921 
6922    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6923       if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
6924          pc.StallAtPixelScoreboard = true;
6925          pc.CommandStreamerStallEnable = true;
6926       }
6927 
6928       pc.DestinationAddressType  = DAT_PPGTT;
6929       pc.PostSyncOperation       = WriteImmediateData;
6930       pc.Address = (struct anv_address) {
6931          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
6932          event->state.offset
6933       };
6934       pc.ImmediateData           = VK_EVENT_RESET;
6935       anv_debug_dump_pc(pc);
6936    }
6937 }
6938 
6939 void genX(CmdWaitEvents2KHR)(
6940     VkCommandBuffer                             commandBuffer,
6941     uint32_t                                    eventCount,
6942     const VkEvent*                              pEvents,
6943     const VkDependencyInfoKHR*                  pDependencyInfos)
6944 {
6945    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6946 
6947 #if GFX_VER >= 8
6948    for (uint32_t i = 0; i < eventCount; i++) {
6949       ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
6950 
6951       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6952          sem.WaitMode            = PollingMode;
6953          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
6954          sem.SemaphoreDataDword  = VK_EVENT_SET;
6955          sem.SemaphoreAddress = (struct anv_address) {
6956             cmd_buffer->device->dynamic_state_pool.block_pool.bo,
6957             event->state.offset
6958          };
6959       }
6960    }
6961 #else
6962    anv_finishme("Implement events on gfx7");
6963 #endif
6964 
6965    cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
6966 }
6967 
6968 VkResult genX(CmdSetPerformanceOverrideINTEL)(
6969     VkCommandBuffer                             commandBuffer,
6970     const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
6971 {
6972    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6973 
6974    switch (pOverrideInfo->type) {
6975    case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
6976 #if GFX_VER >= 9
6977       anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
6978          csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
6979          csdm2.MediaInstructionDisable = pOverrideInfo->enable;
6980          csdm2._3DRenderingInstructionDisableMask = true;
6981          csdm2.MediaInstructionDisableMask = true;
6982       }
6983 #else
6984       anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
6985          instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
6986          instpm.MediaInstructionDisable = pOverrideInfo->enable;
6987          instpm._3DRenderingInstructionDisableMask = true;
6988          instpm.MediaInstructionDisableMask = true;
6989       }
6990 #endif
6991       break;
6992    }
6993 
6994    case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
6995       if (pOverrideInfo->enable) {
6996          /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
6997          anv_add_pending_pipe_bits(cmd_buffer,
6998                                    ANV_PIPE_FLUSH_BITS |
6999                                    ANV_PIPE_INVALIDATE_BITS,
7000                                    "perf counter isolation");
7001          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7002       }
7003       break;
7004 
7005    default:
7006       unreachable("Invalid override");
7007    }
7008 
7009    return VK_SUCCESS;
7010 }
7011 
7012 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
7013     VkCommandBuffer                             commandBuffer,
7014     const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
7015 {
7016    /* TODO: Waiting on the register to write, might depend on generation. */
7017 
7018    return VK_SUCCESS;
7019 }
7020 
7021 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
7022                               struct anv_bo *bo,
7023                               uint32_t offset) {
7024    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
7025       pc.CommandStreamerStallEnable = true;
7026       pc.PostSyncOperation       = WriteTimestamp;
7027       pc.Address = (struct anv_address) {bo, offset};
7028       anv_debug_dump_pc(pc);
7029    }
7030 }
7031