1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_render_pass.h"
31 #include "vk_util.h"
32 #include "util/fast_idiv_by_const.h"
33
34 #include "common/intel_aux_map.h"
35 #include "common/intel_l3_config.h"
36 #include "genxml/gen_macros.h"
37 #include "genxml/genX_pack.h"
38 #include "genxml/gen_rt_pack.h"
39 #include "common/intel_guardband.h"
40
41 #include "nir/nir_xfb_info.h"
42
43 #include "ds/intel_tracepoints.h"
44
45 /* We reserve:
46 * - GPR 14 for secondary command buffer returns
47 * - GPR 15 for conditional rendering
48 */
49 #define MI_BUILDER_NUM_ALLOC_GPRS 14
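/* i.e. the MI builder may allocate GPR0-GPR13 for its own temporaries. */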
50 #define __gen_get_batch_dwords anv_batch_emit_dwords
51 #define __gen_address_offset anv_address_add
52 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
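/* Route the shared MI builder's batch emission and address handling through
 * anv's command buffer helpers before including it.
 */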
53 #include "common/mi_builder.h"
54
55 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
56 uint32_t pipeline);
57
58 static enum anv_pipe_bits
59 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
60 enum anv_pipe_bits bits = 0;
61 bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
62 bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
63 #if GFX_VERx10 >= 125
64 bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
65 #endif
66 #if GFX_VER >= 12
67 bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
68 bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
69 #endif
70 bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
71 bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
72 bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
73 bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
74 bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
75 bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
76 bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
77 bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
78 bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
79 return bits;
80 }
81
82 #define anv_debug_dump_pc(pc) \
83 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
84 fputs("pc: emit PC=( ", stderr); \
85 anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
86 fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
87 }
88
89 static bool
90 is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
91 {
92 struct anv_queue_family *queue_family = cmd_buffer->queue_family;
93 return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
94 }
95
96 void
97 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
98 {
99 struct anv_device *device = cmd_buffer->device;
100 uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
101
102 /* If we are emitting a new state base address we probably need to re-emit
103 * binding tables.
104 */
105 cmd_buffer->state.descriptors_dirty |= ~0;
106
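/* On Gfx12.5+, only the binding table pool is (re)programmed here; the
 * STATE_BASE_ADDRESS packet emitted below is compiled out for those platforms.
 */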
107 #if GFX_VERx10 >= 125
108 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
109 pc.CommandStreamerStallEnable = true;
110 anv_debug_dump_pc(pc);
111 }
112 anv_batch_emit(
113 &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
114 btpa.BindingTablePoolBaseAddress =
115 anv_cmd_buffer_surface_base_address(cmd_buffer);
116 btpa.BindingTablePoolBufferSize = BINDING_TABLE_POOL_BLOCK_SIZE / 4096;
117 btpa.MOCS = mocs;
118 }
119 #else /* GFX_VERx10 < 125 */
120 /* Emit a render target cache flush.
121 *
122 * This isn't documented anywhere in the PRM. However, it seems to be
123 * necessary prior to changing the surface state base address. Without
124 * this, we get GPU hangs when using multi-level command buffers which
125 * clear depth, reset state base address, and then go render stuff.
126 */
127 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
128 #if GFX_VER >= 12
129 pc.HDCPipelineFlushEnable = true;
130 #else
131 pc.DCFlushEnable = true;
132 #endif
133 pc.RenderTargetCacheFlushEnable = true;
134 pc.CommandStreamerStallEnable = true;
135 anv_debug_dump_pc(pc);
136 }
137
138 #if GFX_VERx10 == 120
139 /* Wa_1607854226:
140 *
141 * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
142 * mode by putting the pipeline temporarily in 3D mode.
143 */
144 uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
145 genX(flush_pipeline_select_3d)(cmd_buffer);
146 #endif
147
148 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
149 sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
150 sba.GeneralStateMOCS = mocs;
151 sba.GeneralStateBaseAddressModifyEnable = true;
152
153 sba.StatelessDataPortAccessMOCS = mocs;
154
155 sba.SurfaceStateBaseAddress =
156 anv_cmd_buffer_surface_base_address(cmd_buffer);
157 sba.SurfaceStateMOCS = mocs;
158 sba.SurfaceStateBaseAddressModifyEnable = true;
159
160 sba.DynamicStateBaseAddress =
161 (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
162 sba.DynamicStateMOCS = mocs;
163 sba.DynamicStateBaseAddressModifyEnable = true;
164
165 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
166 sba.IndirectObjectMOCS = mocs;
167 sba.IndirectObjectBaseAddressModifyEnable = true;
168
169 sba.InstructionBaseAddress =
170 (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
171 sba.InstructionMOCS = mocs;
172 sba.InstructionBaseAddressModifyEnable = true;
173
174 # if (GFX_VER >= 8)
175 /* Broadwell requires that we specify a buffer size for a bunch of
176 * these fields. However, since we will be growing the BOs live, we
177 * just set them all to the maximum.
178 */
179 sba.GeneralStateBufferSize = 0xfffff;
180 sba.IndirectObjectBufferSize = 0xfffff;
181 if (anv_use_relocations(device->physical)) {
182 sba.DynamicStateBufferSize = 0xfffff;
183 sba.InstructionBufferSize = 0xfffff;
184 } else {
185 /* With softpin, we use fixed addresses so we actually know how big
186 * our base addresses are.
187 */
188 sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
189 sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
190 }
191 sba.GeneralStateBufferSizeModifyEnable = true;
192 sba.IndirectObjectBufferSizeModifyEnable = true;
193 sba.DynamicStateBufferSizeModifyEnable = true;
194 sba.InstructionBuffersizeModifyEnable = true;
195 # else
196 /* On gfx7, we have upper bounds instead. According to the docs,
197 * setting an upper bound of zero means that no bounds checking is
198 * performed so, in theory, we should be able to leave them zero.
199 * However, border color is broken and the GPU bounds-checks anyway.
200 * To avoid this and other potential problems, we may as well set it
201 * for everything.
202 */
203 sba.GeneralStateAccessUpperBound =
204 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
205 sba.GeneralStateAccessUpperBoundModifyEnable = true;
206 sba.DynamicStateAccessUpperBound =
207 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
208 sba.DynamicStateAccessUpperBoundModifyEnable = true;
209 sba.InstructionAccessUpperBound =
210 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
211 sba.InstructionAccessUpperBoundModifyEnable = true;
212 # endif
213 # if (GFX_VER >= 9)
214 sba.BindlessSurfaceStateBaseAddress =
215 (struct anv_address) { device->surface_state_pool.block_pool.bo, 0 };
216 sba.BindlessSurfaceStateSize = (1 << 20) - 1;
217 sba.BindlessSurfaceStateMOCS = mocs;
218 sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
219 # endif
220 # if (GFX_VER >= 10)
221 sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
222 sba.BindlessSamplerStateMOCS = mocs;
223 sba.BindlessSamplerStateBaseAddressModifyEnable = true;
224 sba.BindlessSamplerStateBufferSize = 0;
225 # endif
226 }
227
228 #if GFX_VERx10 == 120
229 /* Wa_1607854226:
230 *
231 * Put the pipeline back into its current mode.
232 */
233 if (gfx12_wa_pipeline != UINT32_MAX)
234 genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
235 #endif
236
237 #endif /* GFX_VERx10 < 125 */
238
239 /* After re-setting the surface state base address, we have to do some
240 * cache flushing so that the sampler engine will pick up the new
241 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
242 * Shared Function > 3D Sampler > State > State Caching (page 96):
243 *
244 * Coherency with system memory in the state cache, like the texture
245 * cache is handled partially by software. It is expected that the
246 * command stream or shader will issue Cache Flush operation or
247 * Cache_Flush sampler message to ensure that the L1 cache remains
248 * coherent with system memory.
249 *
250 * [...]
251 *
252 * Whenever the value of the Dynamic_State_Base_Addr,
253 * Surface_State_Base_Addr are altered, the L1 state cache must be
254 * invalidated to ensure the new surface or sampler state is fetched
255 * from system memory.
256 *
257 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
258 * which, according the PIPE_CONTROL instruction documentation in the
259 * Broadwell PRM:
260 *
261 * Setting this bit is independent of any other bit in this packet.
262 * This bit controls the invalidation of the L1 and L2 state caches
263 * at the top of the pipe i.e. at the parsing time.
264 *
265 * Unfortunately, experimentation seems to indicate that state cache
266 * invalidation through a PIPE_CONTROL does nothing whatsoever in
267 * regards to surface state and binding tables. Instead, it seems that
268 * invalidating the texture cache is what is actually needed.
269 *
270 * XXX: As far as we have been able to determine through
271 * experimentation, flushing the texture cache appears to be
272 * sufficient. The theory here is that all of the sampling/rendering
273 * units cache the binding table in the texture cache. However, we have
274 * yet to be able to actually confirm this.
275 *
276 * Wa_14013910100:
277 *
278 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
279 * or program pipe control with Instruction cache invalidate post
280 * STATE_BASE_ADDRESS command"
281 */
282 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
283 pc.TextureCacheInvalidationEnable = true;
284 pc.ConstantCacheInvalidationEnable = true;
285 pc.StateCacheInvalidationEnable = true;
286 #if GFX_VERx10 == 125
287 pc.InstructionCacheInvalidateEnable = true;
288 #endif
289 anv_debug_dump_pc(pc);
290 }
291 }
292
293 static void
294 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
295 struct anv_state state, struct anv_address addr)
296 {
297 VkResult result;
298
299 if (anv_use_relocations(cmd_buffer->device->physical)) {
300 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
301 result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
302 &cmd_buffer->vk.pool->alloc,
303 state.offset + isl_dev->ss.addr_offset,
304 addr.bo, addr.offset, NULL);
305 } else {
306 result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
307 &cmd_buffer->vk.pool->alloc,
308 addr.bo);
309 }
310
311 if (unlikely(result != VK_SUCCESS))
312 anv_batch_set_error(&cmd_buffer->batch, result);
313 }
314
315 static void
316 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
317 struct anv_surface_state state)
318 {
319 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
320
321 assert(!anv_address_is_null(state.address));
322 add_surface_reloc(cmd_buffer, state.state, state.address);
323
324 if (!anv_address_is_null(state.aux_address)) {
325 VkResult result =
326 anv_reloc_list_add(&cmd_buffer->surface_relocs,
327 &cmd_buffer->vk.pool->alloc,
328 state.state.offset + isl_dev->ss.aux_addr_offset,
329 state.aux_address.bo,
330 state.aux_address.offset,
331 NULL);
332 if (result != VK_SUCCESS)
333 anv_batch_set_error(&cmd_buffer->batch, result);
334 }
335
336 if (!anv_address_is_null(state.clear_address)) {
337 VkResult result =
338 anv_reloc_list_add(&cmd_buffer->surface_relocs,
339 &cmd_buffer->vk.pool->alloc,
340 state.state.offset +
341 isl_dev->ss.clear_color_state_offset,
342 state.clear_address.bo,
343 state.clear_address.offset,
344 NULL);
345 if (result != VK_SUCCESS)
346 anv_batch_set_error(&cmd_buffer->batch, result);
347 }
348 }
349
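/* Returns true if the clear color packs to different bits in the view format
 * (after applying the inverse view swizzle) than in the underlying surface
 * format, i.e. a resolve would require a real format conversion.
 */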
350 static bool
351 isl_color_value_requires_conversion(union isl_color_value color,
352 const struct isl_surf *surf,
353 const struct isl_view *view)
354 {
355 if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
356 return false;
357
358 uint32_t surf_pack[4] = { 0, 0, 0, 0 };
359 isl_color_value_pack(&color, surf->format, surf_pack);
360
361 uint32_t view_pack[4] = { 0, 0, 0, 0 };
362 union isl_color_value swiz_color =
363 isl_color_value_swizzle_inv(color, view->swizzle);
364 isl_color_value_pack(&swiz_color, view->format, view_pack);
365
366 return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
367 }
368
369 static bool
370 anv_can_fast_clear_color_view(struct anv_device * device,
371 struct anv_image_view *iview,
372 VkImageLayout layout,
373 union isl_color_value clear_color,
374 uint32_t num_layers,
375 VkRect2D render_area)
376 {
377 if (iview->planes[0].isl.base_array_layer >=
378 anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
379 iview->planes[0].isl.base_level))
380 return false;
381
382 /* Start by getting the fast clear type. We use the first subpass
383 * layout here because we don't want to fast-clear if the first subpass
384 * to use the attachment can't handle fast-clears.
385 */
386 enum anv_fast_clear_type fast_clear_type =
387 anv_layout_to_fast_clear_type(&device->info, iview->image,
388 VK_IMAGE_ASPECT_COLOR_BIT,
389 layout);
390 switch (fast_clear_type) {
391 case ANV_FAST_CLEAR_NONE:
392 return false;
393 case ANV_FAST_CLEAR_DEFAULT_VALUE:
394 if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
395 return false;
396 break;
397 case ANV_FAST_CLEAR_ANY:
398 break;
399 }
400
401 /* Potentially, we could do partial fast-clears but doing so has crazy
402 * alignment restrictions. It's easier to just restrict to full size
403 * fast clears for now.
404 */
405 if (render_area.offset.x != 0 ||
406 render_area.offset.y != 0 ||
407 render_area.extent.width != iview->vk.extent.width ||
408 render_area.extent.height != iview->vk.extent.height)
409 return false;
410
411 /* On Broadwell and earlier, we can only handle 0/1 clear colors */
412 if (GFX_VER <= 8 &&
413 !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
414 return false;
415
416 /* If the clear color is one that would require non-trivial format
417 * conversion on resolve, we don't bother with the fast clear. This
418 * shouldn't be common as most clear colors are 0/1 and the most common
419 * format re-interpretation is for sRGB.
420 */
421 if (isl_color_value_requires_conversion(clear_color,
422 &iview->image->planes[0].primary_surface.isl,
423 &iview->planes[0].isl)) {
424 anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
425 "Cannot fast-clear to colors which would require "
426 "format conversion on resolve");
427 return false;
428 }
429
430 /* We only allow fast clears to the first slice of an image (level 0,
431 * layer 0) and only for the entire slice. This guarantees us that, at any
432 * given time, there is only one clear color on any given image. At the
433 * time of our testing (Jan 17, 2018), there
434 * were no known applications which would benefit from fast-clearing
435 * more than just the first slice.
436 */
437 if (iview->planes[0].isl.base_level > 0 ||
438 iview->planes[0].isl.base_array_layer > 0) {
439 anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
440 "Rendering with multi-lod or multi-layer framebuffer "
441 "with LOAD_OP_LOAD and baseMipLevel > 0 or "
442 "baseArrayLayer > 0. Not fast clearing.");
443 return false;
444 }
445
446 if (num_layers > 1) {
447 anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
448 "Rendering to a multi-layer framebuffer with "
449 "LOAD_OP_CLEAR. Only fast-clearing the first slice");
450 }
451
452 return true;
453 }
454
455 static bool
456 anv_can_hiz_clear_ds_view(struct anv_device *device,
457 const struct anv_image_view *iview,
458 VkImageLayout layout,
459 VkImageAspectFlags clear_aspects,
460 float depth_clear_value,
461 VkRect2D render_area)
462 {
463 /* We don't do any HiZ or depth fast-clears on gfx7 yet */
464 if (GFX_VER == 7)
465 return false;
466
467 /* If we're just clearing stencil, we can always HiZ clear */
468 if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
469 return true;
470
471 /* We must have depth in order to have HiZ */
472 if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
473 return false;
474
475 const enum isl_aux_usage clear_aux_usage =
476 anv_layout_to_aux_usage(&device->info, iview->image,
477 VK_IMAGE_ASPECT_DEPTH_BIT,
478 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
479 layout);
480 if (!blorp_can_hiz_clear_depth(&device->info,
481 &iview->image->planes[0].primary_surface.isl,
482 clear_aux_usage,
483 iview->planes[0].isl.base_level,
484 iview->planes[0].isl.base_array_layer,
485 render_area.offset.x,
486 render_area.offset.y,
487 render_area.offset.x +
488 render_area.extent.width,
489 render_area.offset.y +
490 render_area.extent.height))
491 return false;
492
493 if (depth_clear_value != ANV_HZ_FC_VAL)
494 return false;
495
496 /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
497 * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
498 * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
499 */
500 if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
501 return false;
502
503 /* If we got here, then we can fast clear */
504 return true;
505 }
506
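/* Force a real load each time this is evaluated so we observe the current
 * contents of a live AUX-TT entry rather than a compiler-cached value.
 */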
507 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
508
509 #if GFX_VER == 12
510 static void
511 anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
512 const struct anv_image *image,
513 VkImageAspectFlagBits aspect,
514 uint32_t base_level, uint32_t level_count,
515 uint32_t base_layer, uint32_t layer_count)
516 {
517 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
518
519 const struct anv_surface *surface = &image->planes[plane].primary_surface;
520 uint64_t base_address =
521 anv_address_physical(anv_image_address(image, &surface->memory_range));
522
523 const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
524 uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);
525
526 /* We're about to live-update the AUX-TT. We really don't want anyone else
527 * trying to read it while we're doing this. We could probably get away
528 * with not having this stall in some cases if we were really careful but
529 * it's better to play it safe. Full stall the GPU.
530 */
531 anv_add_pending_pipe_bits(cmd_buffer,
532 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
533 "before update AUX-TT");
534 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
535
536 struct mi_builder b;
537 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
538
539 for (uint32_t a = 0; a < layer_count; a++) {
540 const uint32_t layer = base_layer + a;
541
542 uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
543 for (uint32_t l = 0; l < level_count; l++) {
544 const uint32_t level = base_level + l;
545
546 uint32_t logical_array_layer, logical_z_offset_px;
547 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
548 logical_array_layer = 0;
549
550 /* If the given miplevel does not have this layer, then any higher
551 * miplevels won't either because miplevels only get smaller the
552 * higher the LOD.
553 */
554 assert(layer < image->vk.extent.depth);
555 if (layer >= anv_minify(image->vk.extent.depth, level))
556 break;
557 logical_z_offset_px = layer;
558 } else {
559 assert(layer < image->vk.array_layers);
560 logical_array_layer = layer;
561 logical_z_offset_px = 0;
562 }
563
564 uint64_t slice_start_offset_B, slice_end_offset_B;
565 isl_surf_get_image_range_B_tile(isl_surf, level,
566 logical_array_layer,
567 logical_z_offset_px,
568 &slice_start_offset_B,
569 &slice_end_offset_B);
570
571 start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
572 end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
573 }
574
575 /* Aux operates 64K at a time */
576 start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
577 end_offset_B = align_u64(end_offset_B, 64 * 1024);
578
579 for (uint64_t offset = start_offset_B;
580 offset < end_offset_B; offset += 64 * 1024) {
581 uint64_t address = base_address + offset;
582
583 uint64_t aux_entry_addr64, *aux_entry_map;
584 aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
585 address, &aux_entry_addr64);
586
587 assert(!anv_use_relocations(cmd_buffer->device->physical));
588 struct anv_address aux_entry_address = {
589 .bo = NULL,
590 .offset = aux_entry_addr64,
591 };
592
593 const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
594 uint64_t new_aux_entry =
595 (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;
596
597 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
598 new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;
599
600 mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
601 }
602 }
603
604 anv_add_pending_pipe_bits(cmd_buffer,
605 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
606 "after update AUX-TT");
607 }
608 #endif /* GFX_VER == 12 */
609
610 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
611 * the initial layout is undefined, the HiZ buffer and depth buffer will
612 * represent the same data at the end of this operation.
613 */
614 static void
615 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
616 const struct anv_image *image,
617 uint32_t base_layer, uint32_t layer_count,
618 VkImageLayout initial_layout,
619 VkImageLayout final_layout,
620 bool will_full_fast_clear)
621 {
622 const uint32_t depth_plane =
623 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
624 if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
625 return;
626
627 #if GFX_VER == 12
628 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
629 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
630 cmd_buffer->device->physical->has_implicit_ccs &&
631 cmd_buffer->device->info.has_aux_map) {
632 anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
633 0, 1, base_layer, layer_count);
634 }
635 #endif
636
637 /* If will_full_fast_clear is set, the caller promises to fast-clear the
638 * largest portion of the specified range as it can. For depth images,
639 * that means the entire image because we don't support multi-LOD HiZ.
640 */
641 assert(image->planes[0].primary_surface.isl.levels == 1);
642 if (will_full_fast_clear)
643 return;
644
645 const enum isl_aux_state initial_state =
646 anv_layout_to_aux_state(&cmd_buffer->device->info, image,
647 VK_IMAGE_ASPECT_DEPTH_BIT,
648 initial_layout);
649 const enum isl_aux_state final_state =
650 anv_layout_to_aux_state(&cmd_buffer->device->info, image,
651 VK_IMAGE_ASPECT_DEPTH_BIT,
652 final_layout);
653
654 const bool initial_depth_valid =
655 isl_aux_state_has_valid_primary(initial_state);
656 const bool initial_hiz_valid =
657 isl_aux_state_has_valid_aux(initial_state);
658 const bool final_needs_depth =
659 isl_aux_state_has_valid_primary(final_state);
660 const bool final_needs_hiz =
661 isl_aux_state_has_valid_aux(final_state);
662
663 /* Getting into the pass-through state for Depth is tricky and involves
664 * both a resolve and an ambiguate. We don't handle that state right now
665 * as anv_layout_to_aux_state never returns it.
666 */
667 assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
668
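/* Make whichever side the final layout relies on valid: a full resolve writes
 * the HiZ-compressed depth back into the depth surface, while an ambiguate
 * resets HiZ so it no longer makes any claim about the depth data.
 */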
669 if (final_needs_depth && !initial_depth_valid) {
670 assert(initial_hiz_valid);
671 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
672 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
673 } else if (final_needs_hiz && !initial_hiz_valid) {
674 assert(initial_depth_valid);
675 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
676 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
677 }
678 }
679
680 #if GFX_VER == 7
681 static inline bool
682 vk_image_layout_stencil_write_optimal(VkImageLayout layout)
683 {
684 return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
685 layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
686 layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
687 }
688 #endif
689
690 /* Transitions a stencil buffer from one layout to another. On gfx7 this may
691 * copy the stencil data into a texturable shadow copy, and on gfx12 it may
692 * initialize the stencil compression state via the AUX-TT and an HZ_OP clear.
693 */
694 static void
695 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
696 const struct anv_image *image,
697 uint32_t base_level, uint32_t level_count,
698 uint32_t base_layer, uint32_t layer_count,
699 VkImageLayout initial_layout,
700 VkImageLayout final_layout,
701 bool will_full_fast_clear)
702 {
703 #if GFX_VER == 7
704 const uint32_t plane =
705 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
706
707 /* On gfx7, we have to store a texturable version of the stencil buffer in
708 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
709 * forth at strategic points. Stencil writes are only allowed in the following
710 * layouts:
711 *
712 * - VK_IMAGE_LAYOUT_GENERAL
713 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
714 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
715 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
716 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
717 *
718 * For general, we have no nice opportunity to transition so we do the copy
719 * to the shadow unconditionally at the end of the subpass. For transfer
720 * destinations, we can update it as part of the transfer op. For the other
721 * layouts, we delay the copy until a transition into some other layout.
722 */
723 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
724 vk_image_layout_stencil_write_optimal(initial_layout) &&
725 !vk_image_layout_stencil_write_optimal(final_layout)) {
726 anv_image_copy_to_shadow(cmd_buffer, image,
727 VK_IMAGE_ASPECT_STENCIL_BIT,
728 base_level, level_count,
729 base_layer, layer_count);
730 }
731 #elif GFX_VER == 12
732 const uint32_t plane =
733 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
734 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
735 return;
736
737 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
738 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
739 cmd_buffer->device->physical->has_implicit_ccs &&
740 cmd_buffer->device->info.has_aux_map) {
741 anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
742 base_level, level_count, base_layer, layer_count);
743
744 /* If will_full_fast_clear is set, the caller promises to fast-clear the
745 * largest portion of the specified range as it can.
746 */
747 if (will_full_fast_clear)
748 return;
749
750 for (uint32_t l = 0; l < level_count; l++) {
751 const uint32_t level = base_level + l;
752 const VkRect2D clear_rect = {
753 .offset.x = 0,
754 .offset.y = 0,
755 .extent.width = anv_minify(image->vk.extent.width, level),
756 .extent.height = anv_minify(image->vk.extent.height, level),
757 };
758
759 uint32_t aux_layers =
760 anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
761 uint32_t level_layer_count =
762 MIN2(layer_count, aux_layers - base_layer);
763
764 /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
765 * Enable:
766 *
767 * "When enabled, Stencil Buffer needs to be initialized via
768 * stencil clear (HZ_OP) before any renderpass."
769 */
770 anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
771 level, base_layer, level_layer_count,
772 clear_rect, 0 /* Stencil clear value */);
773 }
774 }
775 #endif
776 }
777
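/* MMIO offsets of the MI_PREDICATE source/result registers used by the
 * resolve predication helpers below.
 */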
778 #define MI_PREDICATE_SRC0 0x2400
779 #define MI_PREDICATE_SRC1 0x2408
780 #define MI_PREDICATE_RESULT 0x2418
781
782 static void
783 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
784 const struct anv_image *image,
785 VkImageAspectFlagBits aspect,
786 uint32_t level,
787 uint32_t base_layer, uint32_t layer_count,
788 bool compressed)
789 {
790 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
791
792 /* We only have compression tracking for CCS_E */
793 if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
794 return;
795
796 for (uint32_t a = 0; a < layer_count; a++) {
797 uint32_t layer = base_layer + a;
798 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
799 sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
800 image, aspect,
801 level, layer);
802 sdi.ImmediateData = compressed ? UINT32_MAX : 0;
803 }
804 }
805 }
806
807 static void
808 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
809 const struct anv_image *image,
810 VkImageAspectFlagBits aspect,
811 enum anv_fast_clear_type fast_clear)
812 {
813 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
814 sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
815 image, aspect);
816 sdi.ImmediateData = fast_clear;
817 }
818
819 /* Whenever we have fast-clear, we consider that slice to be compressed.
820 * This makes building predicates much easier.
821 */
822 if (fast_clear != ANV_FAST_CLEAR_NONE)
823 set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
824 }
825
826 /* This is only really practical on haswell and above because it requires
827 * MI math in order to get it correct.
828 */
829 #if GFX_VERx10 >= 75
830 static void
831 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
832 const struct anv_image *image,
833 VkImageAspectFlagBits aspect,
834 uint32_t level, uint32_t array_layer,
835 enum isl_aux_op resolve_op,
836 enum anv_fast_clear_type fast_clear_supported)
837 {
838 struct mi_builder b;
839 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
840
841 const struct mi_value fast_clear_type =
842 mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
843 image, aspect));
844
845 if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
846 /* In this case, we're doing a full resolve which means we want the
847 * resolve to happen if any compression (including fast-clears) is
848 * present.
849 *
850 * In order to simplify the logic a bit, we make the assumption that,
851 * if the first slice has been fast-cleared, it is also marked as
852 * compressed. See also set_image_fast_clear_state.
853 */
854 const struct mi_value compression_state =
855 mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
856 image, aspect,
857 level, array_layer));
858 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
859 mi_store(&b, compression_state, mi_imm(0));
860
861 if (level == 0 && array_layer == 0) {
862 /* If the predicate is true, we want to write 0 to the fast clear type
863 * and, if it's false, leave it alone. We can do this by writing
864 *
865 * clear_type = clear_type & ~predicate;
866 */
867 struct mi_value new_fast_clear_type =
868 mi_iand(&b, fast_clear_type,
869 mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
870 mi_store(&b, fast_clear_type, new_fast_clear_type);
871 }
872 } else if (level == 0 && array_layer == 0) {
873 /* In this case, we are doing a partial resolve to get rid of fast-clear
874 * colors. We don't care about the compression state but we do care
875 * about how much fast clear is allowed by the final layout.
876 */
877 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
878 assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
879
880 /* We need to compute (fast_clear_supported < image->fast_clear) */
881 struct mi_value pred =
882 mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
883 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
884
885 /* If the predicate is true, we want to write 0 to the fast clear type
886 * and, if it's false, leave it alone. We can do this by writing
887 *
888 * clear_type = clear_type & ~predicate;
889 */
890 struct mi_value new_fast_clear_type =
891 mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
892 mi_store(&b, fast_clear_type, new_fast_clear_type);
893 } else {
894 /* In this case, we're trying to do a partial resolve on a slice that
895 * doesn't have clear color. There's nothing to do.
896 */
897 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
898 return;
899 }
900
901 /* Set src1 to 0 and use a != condition */
902 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
903
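/* LOADINV + SRCS_EQUAL sets the predicate when SRC0 != SRC1, i.e. when the
 * value loaded above is non-zero, so the predicated resolve that follows
 * only runs when it is actually needed.
 */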
904 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
905 mip.LoadOperation = LOAD_LOADINV;
906 mip.CombineOperation = COMBINE_SET;
907 mip.CompareOperation = COMPARE_SRCS_EQUAL;
908 }
909 }
910 #endif /* GFX_VERx10 >= 75 */
911
912 #if GFX_VER <= 8
913 static void
914 anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
915 const struct anv_image *image,
916 VkImageAspectFlagBits aspect,
917 uint32_t level, uint32_t array_layer,
918 enum isl_aux_op resolve_op,
919 enum anv_fast_clear_type fast_clear_supported)
920 {
921 struct mi_builder b;
922 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
923
924 struct mi_value fast_clear_type_mem =
925 mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
926 image, aspect));
927
928 /* This only works for partial resolves and only when the clear color is
929 * all or nothing. On the upside, this emits less command streamer code
930 * and works on Ivybridge and Bay Trail.
931 */
932 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
933 assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
934
935 /* We don't support fast clears on anything other than the first slice. */
936 if (level > 0 || array_layer > 0)
937 return;
938
939 /* On gfx8, we don't have a concept of default clear colors because we
940 * can't sample from CCS surfaces. It's enough to just load the fast clear
941 * state into the predicate register.
942 */
943 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
944 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
945 mi_store(&b, fast_clear_type_mem, mi_imm(0));
946
947 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
948 mip.LoadOperation = LOAD_LOADINV;
949 mip.CombineOperation = COMBINE_SET;
950 mip.CompareOperation = COMPARE_SRCS_EQUAL;
951 }
952 }
953 #endif /* GFX_VER <= 8 */
954
955 static void
956 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
957 const struct anv_image *image,
958 enum isl_format format,
959 struct isl_swizzle swizzle,
960 VkImageAspectFlagBits aspect,
961 uint32_t level, uint32_t array_layer,
962 enum isl_aux_op resolve_op,
963 enum anv_fast_clear_type fast_clear_supported)
964 {
965 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
966
967 #if GFX_VER >= 9
968 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
969 aspect, level, array_layer,
970 resolve_op, fast_clear_supported);
971 #else /* GFX_VER <= 8 */
972 anv_cmd_simple_resolve_predicate(cmd_buffer, image,
973 aspect, level, array_layer,
974 resolve_op, fast_clear_supported);
975 #endif
976
977 /* CCS_D only supports full resolves and BLORP will assert on us if we try
978 * to do a partial resolve on a CCS_D surface.
979 */
980 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
981 image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
982 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
983
984 anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
985 level, array_layer, 1, resolve_op, NULL, true);
986 }
987
988 static void
989 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
990 const struct anv_image *image,
991 enum isl_format format,
992 struct isl_swizzle swizzle,
993 VkImageAspectFlagBits aspect,
994 uint32_t array_layer,
995 enum isl_aux_op resolve_op,
996 enum anv_fast_clear_type fast_clear_supported)
997 {
998 assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
999 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
1000
1001 #if GFX_VERx10 >= 75
1002 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
1003 aspect, 0, array_layer,
1004 resolve_op, fast_clear_supported);
1005
1006 anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
1007 array_layer, 1, resolve_op, NULL, true);
1008 #else
1009 unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
1010 #endif
1011 }
1012
1013 void
1014 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
1015 const struct anv_image *image,
1016 VkImageAspectFlagBits aspect,
1017 enum isl_aux_usage aux_usage,
1018 uint32_t level,
1019 uint32_t base_layer,
1020 uint32_t layer_count)
1021 {
1022 /* The aspect must be exactly one of the image aspects. */
1023 assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
1024
1025 /* The only compression types with more than just fast-clears are MCS,
1026 * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually
1027 * track the current fast-clear and compression state. This leaves us
1028 * with just MCS and CCS_E.
1029 */
1030 if (aux_usage != ISL_AUX_USAGE_CCS_E &&
1031 aux_usage != ISL_AUX_USAGE_MCS)
1032 return;
1033
1034 set_image_compressed_bit(cmd_buffer, image, aspect,
1035 level, base_layer, layer_count, true);
1036 }
1037
1038 static void
1039 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
1040 const struct anv_image *image,
1041 VkImageAspectFlagBits aspect)
1042 {
1043 assert(cmd_buffer && image);
1044 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1045
1046 set_image_fast_clear_state(cmd_buffer, image, aspect,
1047 ANV_FAST_CLEAR_NONE);
1048
1049 /* Initialize the struct fields that are accessed for fast-clears so that
1050 * the HW restrictions on the field values are satisfied.
1051 */
1052 struct anv_address addr =
1053 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1054
1055 if (GFX_VER >= 9) {
1056 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1057 const unsigned num_dwords = GFX_VER >= 10 ?
1058 isl_dev->ss.clear_color_state_size / 4 :
1059 isl_dev->ss.clear_value_size / 4;
1060 for (unsigned i = 0; i < num_dwords; i++) {
1061 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1062 sdi.Address = addr;
1063 sdi.Address.offset += i * 4;
1064 sdi.ImmediateData = 0;
1065 }
1066 }
1067 } else {
1068 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1069 sdi.Address = addr;
1070 if (GFX_VERx10 >= 75) {
1071 /* Pre-SKL, the dword containing the clear values also contains
1072 * other fields, so we need to initialize those fields to match the
1073 * values that would be in a color attachment.
1074 */
1075 sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 |
1076 ISL_CHANNEL_SELECT_GREEN << 22 |
1077 ISL_CHANNEL_SELECT_BLUE << 19 |
1078 ISL_CHANNEL_SELECT_ALPHA << 16;
1079 } else if (GFX_VER == 7) {
1080 /* On IVB, the dword containing the clear values also contains
1081 * other fields that must be zero or can be zero.
1082 */
1083 sdi.ImmediateData = 0;
1084 }
1085 }
1086 }
1087 }
1088
1089 /* Copy the fast-clear value dword(s) between a surface state object and an
1090 * image's fast clear state buffer.
1091 */
1092 static void
1093 genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
1094 struct anv_state surface_state,
1095 const struct anv_image *image,
1096 VkImageAspectFlagBits aspect,
1097 bool copy_from_surface_state)
1098 {
1099 assert(cmd_buffer && image);
1100 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1101
1102 struct anv_address ss_clear_addr = {
1103 .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
1104 .offset = surface_state.offset +
1105 cmd_buffer->device->isl_dev.ss.clear_value_offset,
1106 };
1107 const struct anv_address entry_addr =
1108 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1109 unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
1110
1111 #if GFX_VER == 7
1112 /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
1113 * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
1114 * in-flight when they are issued even if the memory touched is not
1115 * currently active for rendering. The weird bit is that it is not the
1116 * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
1117 * rendering hangs such that the next stalling command after the
1118 * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
1119 *
1120 * It is unclear exactly why this hang occurs. Both MI commands come with
1121 * warnings about the 3D pipeline but that doesn't seem to fully explain
1122 * it. My (Jason's) best theory is that it has something to do with the
1123 * fact that we're using a GPU state register as our temporary and that
1124 * something with reading/writing it is causing problems.
1125 *
1126 * In order to work around this issue, we emit a PIPE_CONTROL with the
1127 * command streamer stall bit set.
1128 */
1129 anv_add_pending_pipe_bits(cmd_buffer,
1130 ANV_PIPE_CS_STALL_BIT,
1131 "after copy_fast_clear_dwords. Avoid potential hang");
1132 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1133 #endif
1134
1135 struct mi_builder b;
1136 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1137
1138 if (copy_from_surface_state) {
1139 mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
1140 } else {
1141 mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
1142
1143 /* Updating a surface state object may require that the state cache be
1144 * invalidated. From the SKL PRM, Shared Functions -> State -> State
1145 * Caching:
1146 *
1147 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
1148 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
1149 * modified [...], the L1 state cache must be invalidated to ensure
1150 * the new surface or sampler state is fetched from system memory.
1151 *
1152 * In testing, SKL doesn't actually seem to need this, but HSW does.
1153 */
1154 anv_add_pending_pipe_bits(cmd_buffer,
1155 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
1156 "after copy_fast_clear_dwords surface state update");
1157 }
1158 }
1159
1160 /**
1161 * @brief Transitions a color buffer from one layout to another.
1162 *
1163 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
1164 * more information.
1165 *
1166 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
1167 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
1168 * this represents the maximum layers to transition at each
1169 * specified miplevel.
1170 */
1171 static void
1172 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
1173 const struct anv_image *image,
1174 VkImageAspectFlagBits aspect,
1175 const uint32_t base_level, uint32_t level_count,
1176 uint32_t base_layer, uint32_t layer_count,
1177 VkImageLayout initial_layout,
1178 VkImageLayout final_layout,
1179 uint64_t src_queue_family,
1180 uint64_t dst_queue_family,
1181 bool will_full_fast_clear)
1182 {
1183 struct anv_device *device = cmd_buffer->device;
1184 const struct intel_device_info *devinfo = &device->info;
1185 /* Validate the inputs. */
1186 assert(cmd_buffer);
1187 assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1188 /* These values aren't supported for simplicity's sake. */
1189 assert(level_count != VK_REMAINING_MIP_LEVELS &&
1190 layer_count != VK_REMAINING_ARRAY_LAYERS);
1191 /* Ensure the subresource range is valid. */
1192 UNUSED uint64_t last_level_num = base_level + level_count;
1193 const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
1194 UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1195 assert((uint64_t)base_layer + layer_count <= image_layers);
1196 assert(last_level_num <= image->vk.mip_levels);
1197 /* If there is a layout transition, the final layout cannot be undefined or
1198 * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1199 */
1200 assert(initial_layout == final_layout ||
1201 (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1202 final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1203 const struct isl_drm_modifier_info *isl_mod_info =
1204 image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1205 ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1206 : NULL;
1207
1208 const bool src_queue_external =
1209 src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1210 src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1211
1212 const bool dst_queue_external =
1213 dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1214 dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1215
1216 /* Simultaneous acquire and release on external queues is illegal. */
1217 assert(!src_queue_external || !dst_queue_external);
1218
1219 /* Ownership transition on an external queue requires special action if the
1220 * image has a DRM format modifier because we store image data in
1221 * a driver-private bo which is inaccessible to the external queue.
1222 */
1223 const bool private_binding_acquire =
1224 src_queue_external &&
1225 anv_image_is_externally_shared(image) &&
1226 anv_image_has_private_binding(image);
1227
1228 const bool private_binding_release =
1229 dst_queue_external &&
1230 anv_image_is_externally_shared(image) &&
1231 anv_image_has_private_binding(image);
1232
1233 if (initial_layout == final_layout &&
1234 !private_binding_acquire && !private_binding_release) {
1235 /* No work is needed. */
1236 return;
1237 }
1238
1239 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1240
1241 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
1242 final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
1243 /* This surface is a linear compressed image with a tiled shadow surface
1244 * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
1245 * we need to ensure the shadow copy is up-to-date.
1246 */
1247 assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
1248 assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
1249 assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
1250 assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
1251 assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
1252 assert(plane == 0);
1253 anv_image_copy_to_shadow(cmd_buffer, image,
1254 VK_IMAGE_ASPECT_COLOR_BIT,
1255 base_level, level_count,
1256 base_layer, layer_count);
1257 }
1258
1259 if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1260 return;
1261
1262 assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1263
1264 /* The following layouts are equivalent for non-linear images. */
1265 const bool initial_layout_undefined =
1266 initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1267 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1268
1269 bool must_init_fast_clear_state = false;
1270 bool must_init_aux_surface = false;
1271
1272 if (initial_layout_undefined) {
1273 /* The subresource may have been aliased and populated with arbitrary
1274 * data.
1275 */
1276 must_init_fast_clear_state = true;
1277 must_init_aux_surface = true;
1278 } else if (private_binding_acquire) {
1279 /* The fast clear state lives in a driver-private bo, and therefore the
1280 * external/foreign queue is unaware of it.
1281 *
1282 * If this is the first time we are accessing the image, then the fast
1283 * clear state is uninitialized.
1284 *
1285 * If this is NOT the first time we are accessing the image, then the fast
1286 * clear state may still be valid and correct due to the resolve during
1287 * our most recent ownership release. However, we do not track the aux
1288 * state with MI stores, and therefore must assume the worst-case: that
1289 * this is the first time we are accessing the image.
1290 */
1291 assert(image->planes[plane].fast_clear_memory_range.binding ==
1292 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1293 must_init_fast_clear_state = true;
1294
1295 if (image->planes[plane].aux_surface.memory_range.binding ==
1296 ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1297 assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
1298
1299 /* The aux surface, like the fast clear state, lives in
1300 * a driver-private bo. We must initialize the aux surface for the
1301 * same reasons we must initialize the fast clear state.
1302 */
1303 must_init_aux_surface = true;
1304 } else {
1305 assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
1306
1307 /* The aux surface, unlike the fast clear state, lives in
1308 * application-visible VkDeviceMemory and is shared with the
1309 * external/foreign queue. Therefore, when we acquire ownership of the
1310 * image with a defined VkImageLayout, the aux surface is valid and has
1311 * the aux state required by the modifier.
1312 */
1313 must_init_aux_surface = false;
1314 }
1315 }
1316
1317 #if GFX_VER == 12
1318 if (initial_layout_undefined) {
1319 if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
1320 anv_image_init_aux_tt(cmd_buffer, image, aspect,
1321 base_level, level_count,
1322 base_layer, layer_count);
1323 }
1324 }
1325 #else
1326 assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
1327 #endif
1328
1329 if (must_init_fast_clear_state) {
1330 if (base_level == 0 && base_layer == 0)
1331 init_fast_clear_color(cmd_buffer, image, aspect);
1332 }
1333
1334 if (must_init_aux_surface) {
1335 assert(must_init_fast_clear_state);
1336
1337 /* Initialize the aux buffers to enable correct rendering. In order to
1338 * ensure that things such as storage images work correctly, aux buffers
1339 * need to be initialized to valid data.
1340 *
1341 * Having an aux buffer with invalid data is a problem for two reasons:
1342 *
1343 * 1) Having an invalid value in the buffer can confuse the hardware.
1344 * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1345 * invalid and leads to the hardware doing strange things. It
1346 * doesn't hang as far as we can tell but rendering corruption can
1347 * occur.
1348 *
1349 * 2) If this transition is into the GENERAL layout and we then use the
1350 * image as a storage image, then we must have the aux buffer in the
1351 * pass-through state so that, if we then go to texture from the
1352 * image, we get the results of our storage image writes and not the
1353 * fast clear color or other random data.
1354 *
1355 * For CCS both of the problems above are real demonstrable issues. In
1356 * that case, the only thing we can do is to perform an ambiguate to
1357 * transition the aux surface into the pass-through state.
1358 *
1359 * For MCS, (2) is never an issue because we don't support multisampled
1360 * storage images. In theory, issue (1) is a problem with MCS but we've
1361 * never seen it in the wild. For 4x and 16x, all bit patterns could, in
1362 * theory, be interpreted as something but we don't know that all bit
1363 * patterns are actually valid. For 2x and 8x, you could easily end up
1364 * with the MCS referring to an invalid plane because not all bits of
1365 * the MCS value are actually used. Even though we've never seen issues
1366 * in the wild, it's best to play it safe and initialize the MCS. We
1367 * can use a fast-clear for MCS because we only ever touch from render
1368 * and texture (no image load store).
1369 */
1370 if (image->vk.samples == 1) {
1371 for (uint32_t l = 0; l < level_count; l++) {
1372 const uint32_t level = base_level + l;
1373
1374 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1375 if (base_layer >= aux_layers)
1376 break; /* We will only get fewer layers as level increases */
1377 uint32_t level_layer_count =
1378 MIN2(layer_count, aux_layers - base_layer);
1379
1380 /* If will_full_fast_clear is set, the caller promises to
1381 * fast-clear the largest portion of the specified range as it can.
1382 * For color images, that means only the first LOD and array slice.
1383 */
1384 if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1385 base_layer++;
1386 level_layer_count--;
1387 if (level_layer_count == 0)
1388 continue;
1389 }
1390
1391 anv_image_ccs_op(cmd_buffer, image,
1392 image->planes[plane].primary_surface.isl.format,
1393 ISL_SWIZZLE_IDENTITY,
1394 aspect, level, base_layer, level_layer_count,
1395 ISL_AUX_OP_AMBIGUATE, NULL, false);
1396
1397 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
1398 set_image_compressed_bit(cmd_buffer, image, aspect,
1399 level, base_layer, level_layer_count,
1400 false);
1401 }
1402 }
1403 } else {
1404 if (image->vk.samples == 4 || image->vk.samples == 16) {
1405 anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1406 "Doing a potentially unnecessary fast-clear to "
1407 "define an MCS buffer.");
1408 }
1409
1410 /* If will_full_fast_clear is set, the caller promises to fast-clear
1411 * the largest portion of the specified range as it can.
1412 */
1413 if (will_full_fast_clear)
1414 return;
1415
1416 assert(base_level == 0 && level_count == 1);
1417 anv_image_mcs_op(cmd_buffer, image,
1418 image->planes[plane].primary_surface.isl.format,
1419 ISL_SWIZZLE_IDENTITY,
1420 aspect, base_layer, layer_count,
1421 ISL_AUX_OP_FAST_CLEAR, NULL, false);
1422 }
1423 return;
1424 }
1425
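   /* Translate the initial and final layouts into the aux usage and
    * fast-clear support they imply; comparing the two below tells us
    * whether a partial or full resolve is required for the transition.
    */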
1426 enum isl_aux_usage initial_aux_usage =
1427 anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1428 enum isl_aux_usage final_aux_usage =
1429 anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1430 enum anv_fast_clear_type initial_fast_clear =
1431 anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1432 enum anv_fast_clear_type final_fast_clear =
1433 anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1434
1435 /* We must override the results of the anv_layout_to_* functions because
1436 * they are unaware of the acquire/release direction.
1437 */
1438 if (private_binding_acquire) {
1439 initial_aux_usage = isl_mod_info->aux_usage;
1440 initial_fast_clear = isl_mod_info->supports_clear_color ?
1441 initial_fast_clear : ANV_FAST_CLEAR_NONE;
1442 } else if (private_binding_release) {
1443 final_aux_usage = isl_mod_info->aux_usage;
1444 final_fast_clear = isl_mod_info->supports_clear_color ?
1445 final_fast_clear : ANV_FAST_CLEAR_NONE;
1446 }
1447
1448 /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1449 * We can handle transitions between CCS_D/E to and from NONE. What we
1450 * don't yet handle is switching between CCS_E and CCS_D within a given
1451 * image. Doing so in a performant way requires more detailed aux state
1452 * tracking such as what is done in i965. For now, just assume that we
1453 * only have one type of compression.
1454 */
1455 assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1456 final_aux_usage == ISL_AUX_USAGE_NONE ||
1457 initial_aux_usage == final_aux_usage);
1458
1459 /* If initial aux usage is NONE, there is nothing to resolve */
1460 if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1461 return;
1462
1463 enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1464
1465 /* If the initial layout supports more fast clear than the final layout
1466 * then we need at least a partial resolve.
1467 */
1468 if (final_fast_clear < initial_fast_clear)
1469 resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1470
1471 if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
1472 final_aux_usage != ISL_AUX_USAGE_CCS_E)
1473 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1474
1475 if (resolve_op == ISL_AUX_OP_NONE)
1476 return;
1477
1478 /* Perform a resolve to synchronize data between the main and aux buffer.
1479 * Before we begin, we must satisfy the cache flushing requirement specified
1480 * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1481 *
1482 * Any transition from any value in {Clear, Render, Resolve} to a
1483 * different value in {Clear, Render, Resolve} requires end of pipe
1484 * synchronization.
1485 *
1486 * We perform a flush of the write cache before and after the clear and
1487 * resolve operations to meet this requirement.
1488 *
1489 * Unlike other drawing, fast clear operations are not properly
1490 * synchronized. The first PIPE_CONTROL here likely ensures that the
1491 * contents of the previous render or clear hit the render target before we
1492 * resolve and the second likely ensures that the resolve is complete before
1493 * we do any more rendering or clearing.
1494 */
1495 anv_add_pending_pipe_bits(cmd_buffer,
1496 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1497 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1498 "after transition RT");
1499
1500 for (uint32_t l = 0; l < level_count; l++) {
1501 uint32_t level = base_level + l;
1502
1503 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1504 if (base_layer >= aux_layers)
1505 break; /* We will only get fewer layers as level increases */
1506 uint32_t level_layer_count =
1507 MIN2(layer_count, aux_layers - base_layer);
1508
1509 for (uint32_t a = 0; a < level_layer_count; a++) {
1510 uint32_t array_layer = base_layer + a;
1511
1512 /* If will_full_fast_clear is set, the caller promises to fast-clear
1513 * the largest portion of the specified range as it can. For color
1514 * images, that means only the first LOD and array slice.
1515 */
1516 if (level == 0 && array_layer == 0 && will_full_fast_clear)
1517 continue;
1518
1519 if (image->vk.samples == 1) {
1520 anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1521 image->planes[plane].primary_surface.isl.format,
1522 ISL_SWIZZLE_IDENTITY,
1523 aspect, level, array_layer, resolve_op,
1524 final_fast_clear);
1525 } else {
1526 /* We only support fast-clear on the first layer so partial
1527 * resolves should not be used on other layers as they will use
1528 * the clear color stored in memory that is only valid for layer0.
1529 */
1530 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1531 array_layer != 0)
1532 continue;
1533
1534 anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1535 image->planes[plane].primary_surface.isl.format,
1536 ISL_SWIZZLE_IDENTITY,
1537 aspect, array_layer, resolve_op,
1538 final_fast_clear);
1539 }
1540 }
1541 }
1542
1543 anv_add_pending_pipe_bits(cmd_buffer,
1544 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1545 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1546 "after transition RT");
1547 }
1548
1549 static MUST_CHECK VkResult
1550 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1551 uint32_t color_att_count,
1552 uint32_t color_att_valid)
1553 {
1554 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1555
1556 /* Reserve one for the NULL state. */
1557 unsigned num_states = 1 + util_bitcount(color_att_valid);
1558 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
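   /* Stride between consecutive surface states: one state rounded up to the
    * ISL-required alignment.
    */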
1559 const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
1560 gfx->att_states =
1561 anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1562 num_states * ss_stride, isl_dev->ss.align);
1563 if (gfx->att_states.map == NULL) {
1564 return anv_batch_set_error(&cmd_buffer->batch,
1565 VK_ERROR_OUT_OF_DEVICE_MEMORY);
1566 }
1567
1568 struct anv_state next_state = gfx->att_states;
1569 next_state.alloc_size = isl_dev->ss.size;
1570
1571 gfx->null_surface_state = next_state;
1572 next_state.offset += ss_stride;
1573 next_state.map += ss_stride;
1574
1575 gfx->color_att_count = color_att_count;
1576 for (uint32_t i = 0; i < color_att_count; i++) {
1577 if (color_att_valid & BITFIELD_BIT(i)) {
1578 gfx->color_att[i] = (struct anv_attachment) {
1579 .surface_state.state = next_state,
1580 };
1581 next_state.offset += ss_stride;
1582 next_state.map += ss_stride;
1583 } else {
1584 gfx->color_att[i] = (struct anv_attachment) {
1585 .surface_state.state = gfx->null_surface_state,
1586 };
1587 }
1588 }
1589 gfx->depth_att = (struct anv_attachment) { };
1590 gfx->stencil_att = (struct anv_attachment) { };
1591
1592 return VK_SUCCESS;
1593 }
1594
1595 static void
1596 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1597 {
1598 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1599
1600 gfx->render_area = (VkRect2D) { };
1601 gfx->layer_count = 0;
1602 gfx->samples = 0;
1603
1604 gfx->color_att_count = 0;
1605 gfx->depth_att = (struct anv_attachment) { };
1606 gfx->stencil_att = (struct anv_attachment) { };
1607 gfx->null_surface_state = ANV_STATE_NULL;
1608 }
1609
1610 VkResult
1611 genX(BeginCommandBuffer)(
1612 VkCommandBuffer commandBuffer,
1613 const VkCommandBufferBeginInfo* pBeginInfo)
1614 {
1615 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1616 VkResult result;
1617
1618 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1619 * command buffer's state. Otherwise, we must *reset* its state. In both
1620 * cases we reset it.
1621 *
1622 * From the Vulkan 1.0 spec:
1623 *
1624 * If a command buffer is in the executable state and the command buffer
1625 * was allocated from a command pool with the
1626 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1627 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
1628 * as if vkResetCommandBuffer had been called with
1629 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1630 * the command buffer in the recording state.
1631 */
1632 anv_cmd_buffer_reset(cmd_buffer);
1633 anv_cmd_buffer_reset_rendering(cmd_buffer);
1634
1635 cmd_buffer->usage_flags = pBeginInfo->flags;
1636
1637 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1638 * primary level command buffers.
1639 *
1640 * From the Vulkan 1.0 spec:
1641 *
1642 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1643 * secondary command buffer is considered to be entirely inside a render
1644 * pass. If this is a primary command buffer, then this bit is ignored.
1645 */
1646 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1647 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1648
1649 trace_intel_begin_cmd_buffer(&cmd_buffer->trace, cmd_buffer);
1650
1651 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1652
1653 /* We sometimes store vertex data in the dynamic state buffer for blorp
1654 * operations and our dynamic state stream may re-use data from previous
1655 * command buffers. In order to prevent stale cache data, we flush the VF
1656 * cache. We could do this on every blorp call but that's not really
1657 * needed as all of the data will get written by the CPU prior to the GPU
1658 * executing anything. The chances are fairly high that they will use
1659 * blorp at least once per primary command buffer so it shouldn't be
1660 * wasted.
1661 *
1662 * There is also a workaround on gfx8 which requires us to invalidate the
1663 * VF cache occasionally. It's easier if we can assume we start with a
1664 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1665 */
1666 anv_add_pending_pipe_bits(cmd_buffer,
1667 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1668 "new cmd buffer");
1669
1670 /* Re-emit the aux table register in every command buffer. This way we're
1671 * ensured that we have the table even if this command buffer doesn't
1672 * initialize any images.
1673 */
1674 if (cmd_buffer->device->info.has_aux_map) {
1675 anv_add_pending_pipe_bits(cmd_buffer,
1676 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
1677 "new cmd buffer with aux-tt");
1678 }
1679
1680 /* We send an "Indirect State Pointers Disable" packet at
1681 * EndCommandBuffer, so all push constant packets are ignored during a
1682 * context restore. Documentation says after that command, we need to
1683 * emit push constants again before any rendering operation. So we
1684 * flag them dirty here to make sure they get emitted.
1685 */
1686 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1687
1688 if (cmd_buffer->usage_flags &
1689 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1690 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1691
1692 const VkCommandBufferInheritanceRenderingInfoKHR *inheritance_info =
1693 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1694 pBeginInfo);
1695
1696 /* We can't get this information from the inheritance info */
1697 gfx->render_area = (VkRect2D) { };
1698 gfx->layer_count = 0;
1699 gfx->samples = 0;
1700 gfx->depth_att = (struct anv_attachment) { };
1701 gfx->stencil_att = (struct anv_attachment) { };
1702
1703 if (inheritance_info == NULL) {
1704 gfx->rendering_flags = 0;
1705 gfx->view_mask = 0;
1706 gfx->samples = 0;
1707 result = anv_cmd_buffer_init_attachments(cmd_buffer, 0, 0);
1708 if (result != VK_SUCCESS)
1709 return result;
1710 } else {
1711 gfx->rendering_flags = inheritance_info->flags;
1712 gfx->view_mask = inheritance_info->viewMask;
1713 gfx->samples = inheritance_info->rasterizationSamples;
1714
1715 uint32_t color_att_valid = 0;
1716 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1717 for (uint32_t i = 0; i < color_att_count; i++) {
1718 VkFormat format = inheritance_info->pColorAttachmentFormats[i];
1719 if (format != VK_FORMAT_UNDEFINED)
1720 color_att_valid |= BITFIELD_BIT(i);
1721 }
1722 result = anv_cmd_buffer_init_attachments(cmd_buffer,
1723 color_att_count,
1724 color_att_valid);
1725 if (result != VK_SUCCESS)
1726 return result;
1727
1728 for (uint32_t i = 0; i < color_att_count; i++) {
1729 gfx->color_att[i].vk_format =
1730 inheritance_info->pColorAttachmentFormats[i];
1731 }
1732 gfx->depth_att.vk_format =
1733 inheritance_info->depthAttachmentFormat;
1734 gfx->stencil_att.vk_format =
1735 inheritance_info->stencilAttachmentFormat;
1736 }
1737
1738 /* Try to figure out the depth buffer if we can */
1739 if (pBeginInfo->pInheritanceInfo->renderPass != VK_NULL_HANDLE &&
1740 pBeginInfo->pInheritanceInfo->framebuffer != VK_NULL_HANDLE) {
1741 VK_FROM_HANDLE(vk_render_pass, pass,
1742 pBeginInfo->pInheritanceInfo->renderPass);
1743 VK_FROM_HANDLE(vk_framebuffer, fb,
1744 pBeginInfo->pInheritanceInfo->framebuffer);
1745 const struct vk_subpass *subpass =
1746 &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1747
1748 if (!(fb->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR) &&
1749 subpass->depth_stencil_attachment != NULL) {
1750 const struct vk_subpass_attachment *att =
1751 subpass->depth_stencil_attachment;
1752
1753 assert(att->attachment < fb->attachment_count);
1754 ANV_FROM_HANDLE(anv_image_view, iview,
1755 fb->attachments[att->attachment]);
1756
1757 if (iview->vk.image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
1758 assert(gfx->depth_att.vk_format == iview->vk.format);
1759 gfx->depth_att.iview = iview;
1760 gfx->depth_att.layout = att->layout;
1761 gfx->depth_att.aux_usage =
1762 anv_layout_to_aux_usage(&cmd_buffer->device->info,
1763 iview->image,
1764 VK_IMAGE_ASPECT_DEPTH_BIT,
1765 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
1766 att->layout);
1767 }
1768
1769 if (iview->vk.image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1770 assert(gfx->stencil_att.vk_format == iview->vk.format);
1771 gfx->stencil_att.iview = iview;
1772 gfx->stencil_att.layout = att->stencil_layout;
1773 gfx->stencil_att.aux_usage =
1774 anv_layout_to_aux_usage(&cmd_buffer->device->info,
1775 iview->image,
1776 VK_IMAGE_ASPECT_STENCIL_BIT,
1777 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
1778 att->stencil_layout);
1779 }
1780 }
1781 }
1782
1783 if (gfx->depth_att.iview != NULL) {
1784 cmd_buffer->state.hiz_enabled =
1785 isl_aux_usage_has_hiz(gfx->depth_att.aux_usage);
1786 }
1787
1788 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1789 }
1790
1791 #if GFX_VERx10 >= 75
1792 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1793 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1794 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1795
1796 /* If secondary buffer supports conditional rendering
1797 * we should emit commands as if conditional rendering is enabled.
1798 */
1799 cmd_buffer->state.conditional_render_enabled =
1800 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1801 }
1802 #endif
1803
1804 return VK_SUCCESS;
1805 }
1806
1807 /* From the PRM, Volume 2a:
1808 *
1809 * "Indirect State Pointers Disable
1810 *
1811 * At the completion of the post-sync operation associated with this pipe
1812 * control packet, the indirect state pointers in the hardware are
1813 * considered invalid; the indirect pointers are not saved in the context.
1814 * If any new indirect state commands are executed in the command stream
1815 * while the pipe control is pending, the new indirect state commands are
1816 * preserved.
1817 *
1818 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1819 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1820 * commands are only considered as Indirect State Pointers. Once ISP is
1821 * issued in a context, SW must initialize by programming push constant
1822 * commands for all the shaders (at least to zero length) before attempting
1823 * any rendering operation for the same context."
1824 *
1825 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1826 * even though they point to a BO that has been already unreferenced at
1827 * the end of the previous batch buffer. This has been fine so far since
1828 * we are protected by these scratch page (every address not covered by
1829 * a BO should be pointing to the scratch page). But on CNL, it is
1830 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1831 * instruction.
1832 *
1833 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1834 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1835 * context restore, so the mentioned hang doesn't happen. However,
1836 * software must program push constant commands for all stages prior to
1837 * rendering anything. So we flag them dirty in BeginCommandBuffer.
1838 *
1839 * Finally, we also make sure to stall at pixel scoreboard to make sure the
1840 * constants have been loaded into the EUs prior to disabling the push
1841 * constants, so that we don't hang a previous 3DPRIMITIVE.
1842 */
1843 static void
1844 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1845 {
1846 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1847 pc.StallAtPixelScoreboard = true;
1848 pc.CommandStreamerStallEnable = true;
1849 anv_debug_dump_pc(pc);
1850 }
1851 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1852 pc.IndirectStatePointersDisable = true;
1853 pc.CommandStreamerStallEnable = true;
1854 anv_debug_dump_pc(pc);
1855 }
1856 }
1857
1858 VkResult
1859 genX(EndCommandBuffer)(
1860 VkCommandBuffer commandBuffer)
1861 {
1862 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1863
1864 if (anv_batch_has_error(&cmd_buffer->batch))
1865 return cmd_buffer->batch.status;
1866
1867 anv_measure_endcommandbuffer(cmd_buffer);
1868
1869 /* We want every command buffer to start with the PMA fix in a known state,
1870 * so we disable it at the end of the command buffer.
1871 */
1872 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1873
1874 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1875
1876 emit_isp_disable(cmd_buffer);
1877
1878 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer,
1879 cmd_buffer->vk.level);
1880
1881 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1882
1883 return VK_SUCCESS;
1884 }
1885
1886 void
1887 genX(CmdExecuteCommands)(
1888 VkCommandBuffer commandBuffer,
1889 uint32_t commandBufferCount,
1890 const VkCommandBuffer* pCmdBuffers)
1891 {
1892 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1893
1894 assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1895
1896 if (anv_batch_has_error(&primary->batch))
1897 return;
1898
1899 /* The secondary command buffers will assume that the PMA fix is disabled
1900 * when they begin executing. Make sure this is true.
1901 */
1902 genX(cmd_buffer_enable_pma_fix)(primary, false);
1903
1904 /* The secondary command buffer doesn't know which textures etc. have been
1905 * flushed prior to their execution. Apply those flushes now.
1906 */
1907 genX(cmd_buffer_apply_pipe_flushes)(primary);
1908
1909 for (uint32_t i = 0; i < commandBufferCount; i++) {
1910 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1911
1912 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1913 assert(!anv_batch_has_error(&secondary->batch));
1914
1915 #if GFX_VERx10 >= 75
1916 if (secondary->state.conditional_render_enabled) {
1917 if (!primary->state.conditional_render_enabled) {
1918 /* Secondary buffer is constructed as if it will be executed
1919 * with conditional rendering, we should satisfy this dependency
1920 * regardless of conditional rendering being enabled in primary.
1921 */
1922 struct mi_builder b;
1923 mi_builder_init(&b, &primary->device->info, &primary->batch);
1924 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1925 mi_imm(UINT64_MAX));
1926 }
1927 }
1928 #endif
1929
1930 if (secondary->usage_flags &
1931 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1932 /* If we're continuing a render pass from the primary, we need to
1933 * copy the surface states for the current subpass into the storage
1934 * we allocated for them in BeginCommandBuffer.
1935 */
1936 struct anv_bo *ss_bo =
1937 primary->device->surface_state_pool.block_pool.bo;
1938 struct anv_state src_state = primary->state.gfx.att_states;
1939 struct anv_state dst_state = secondary->state.gfx.att_states;
1940 assert(src_state.alloc_size == dst_state.alloc_size);
1941
1942 genX(cmd_buffer_so_memcpy)(primary,
1943 (struct anv_address) {
1944 .bo = ss_bo,
1945 .offset = dst_state.offset,
1946 },
1947 (struct anv_address) {
1948 .bo = ss_bo,
1949 .offset = src_state.offset,
1950 },
1951 src_state.alloc_size);
1952 }
1953
1954 anv_cmd_buffer_add_secondary(primary, secondary);
1955
1956 assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1957 secondary->perf_query_pool == primary->perf_query_pool);
1958 if (secondary->perf_query_pool)
1959 primary->perf_query_pool = secondary->perf_query_pool;
1960
1961 #if GFX_VERx10 == 120
1962 if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
1963 primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
1964 #endif
1965 }
1966
1967 /* The secondary isn't counted in our VF cache tracking so we need to
1968 * invalidate the whole thing.
1969 */
1970 if (GFX_VER >= 8 && GFX_VER <= 9) {
1971 anv_add_pending_pipe_bits(primary,
1972 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1973 "Secondary cmd buffer not tracked in VF cache");
1974 }
1975
1976 /* The secondary may have selected a different pipeline (3D or compute) and
1977 * may have changed the current L3$ configuration. Reset our tracking
1978 * variables to invalid values to ensure that we re-emit these in the case
1979 * where we do any draws or compute dispatches from the primary after the
1980 * secondary has returned.
1981 */
1982 primary->state.current_pipeline = UINT32_MAX;
1983 primary->state.current_l3_config = NULL;
1984 primary->state.current_hash_scale = 0;
1985
1986 /* Each of the secondary command buffers will use its own state base
1987 * address. We need to re-emit state base address for the primary after
1988 * all of the secondaries are done.
1989 *
1990 * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1991 * address calls?
1992 */
1993 genX(cmd_buffer_emit_state_base_address)(primary);
1994 }
1995
1996 /**
1997 * Program the hardware to use the specified L3 configuration.
1998 */
1999 void
2000 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
2001 const struct intel_l3_config *cfg)
2002 {
2003 assert(cfg || GFX_VER >= 12);
2004 if (cfg == cmd_buffer->state.current_l3_config)
2005 return;
2006
2007 #if GFX_VER >= 11
2008 /* On Gfx11+ we use only one config, so verify it remains the same and skip
2009 * the stalling programming entirely.
2010 */
2011 assert(cfg == cmd_buffer->device->l3_config);
2012 #else
2013 if (INTEL_DEBUG(DEBUG_L3)) {
2014 mesa_logd("L3 config transition: ");
2015 intel_dump_l3_config(cfg, stderr);
2016 }
2017
2018 /* According to the hardware docs, the L3 partitioning can only be changed
2019 * while the pipeline is completely drained and the caches are flushed,
2020 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
2021 */
2022 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2023 pc.DCFlushEnable = true;
2024 pc.PostSyncOperation = NoWrite;
2025 pc.CommandStreamerStallEnable = true;
2026 anv_debug_dump_pc(pc);
2027 }
2028
2029 /* ...followed by a second pipelined PIPE_CONTROL that initiates
2030 * invalidation of the relevant caches. Note that because RO invalidation
2031 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
2032 * command is processed by the CS) we cannot combine it with the previous
2033 * stalling flush as the hardware documentation suggests, because that
2034 * would cause the CS to stall on previous rendering *after* RO
2035 * invalidation and wouldn't prevent the RO caches from being polluted by
2036 * concurrent rendering before the stall completes. This intentionally
2037 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
2038 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
2039 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
2040 * already guarantee that there is no concurrent GPGPU kernel execution
2041 * (see SKL HSD 2132585).
2042 */
2043 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2044 pc.TextureCacheInvalidationEnable = true;
2045 pc.ConstantCacheInvalidationEnable = true;
2046 pc.InstructionCacheInvalidateEnable = true;
2047 pc.StateCacheInvalidationEnable = true;
2048 pc.PostSyncOperation = NoWrite;
2049 anv_debug_dump_pc(pc);
2050 }
2051
2052 /* Now send a third stalling flush to make sure that invalidation is
2053 * complete when the L3 configuration registers are modified.
2054 */
2055 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2056 pc.DCFlushEnable = true;
2057 pc.PostSyncOperation = NoWrite;
2058 pc.CommandStreamerStallEnable = true;
2059 anv_debug_dump_pc(pc);
2060 }
2061
2062 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
2063 #endif /* GFX_VER >= 11 */
2064 cmd_buffer->state.current_l3_config = cfg;
2065 }
2066
2067 enum anv_pipe_bits
2068 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
2069 struct anv_device *device,
2070 uint32_t current_pipeline,
2071 enum anv_pipe_bits bits)
2072 {
2073 /*
2074 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
2075 *
2076 * Write synchronization is a special case of end-of-pipe
2077 * synchronization that requires that the render cache and/or depth
2078 * related caches are flushed to memory, where the data will become
2079 * globally visible. This type of synchronization is required prior to
2080 * SW (CPU) actually reading the result data from memory, or initiating
2081 * an operation that will use as a read surface (such as a texture
2082 * surface) a previous render target and/or depth/stencil buffer
2083 *
2084 *
2085 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2086 *
2087 * Exercising the write cache flush bits (Render Target Cache Flush
2088 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
2089 * ensures the write caches are flushed and doesn't guarantee the data
2090 * is globally visible.
2091 *
2092 * SW can track the completion of the end-of-pipe-synchronization by
2093 * using "Notify Enable" and "PostSync Operation - Write Immediate
2094 * Data" in the PIPE_CONTROL command.
2095 *
2096 * In other words, flushes are pipelined while invalidations are handled
2097 * immediately. Therefore, if we're flushing anything then we need to
2098 * schedule an end-of-pipe sync before any invalidations can happen.
2099 */
2100 if (bits & ANV_PIPE_FLUSH_BITS)
2101 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2102
2103
2104 /* HSD 1209978178: docs say that before programming the aux table:
2105 *
2106 * "Driver must ensure that the engine is IDLE but ensure it doesn't
2107 * add extra flushes in the case it knows that the engine is already
2108 * IDLE."
2109 */
2110 if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
2111 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2112
2113 /* If we're going to do an invalidate and we have a pending end-of-pipe
2114 * sync that has yet to be resolved, we do the end-of-pipe sync now.
2115 */
2116 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
2117 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
2118 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
2119 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2120 }
2121
2122 /* Wa_1409226450, Wait for EU to be idle before pipe control which
2123 * invalidates the instruction cache
2124 */
2125 if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
2126 bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2127
2128 /* Project: SKL / Argument: LRI Post Sync Operation [23]
2129 *
2130 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
2131 * programmed prior to programming a PIPECONTROL command with "LRI
2132 * Post Sync Operation" in GPGPU mode of operation (i.e when
2133 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
2134 *
2135 * The same text exists a few rows below for Post Sync Op.
2136 */
2137 if (bits & ANV_PIPE_POST_SYNC_BIT) {
2138 if (GFX_VER == 9 && current_pipeline == GPGPU)
2139 bits |= ANV_PIPE_CS_STALL_BIT;
2140 bits &= ~ANV_PIPE_POST_SYNC_BIT;
2141 }
2142
2143 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2144 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
2145 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2146 #if GFX_VER >= 12
2147 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2148 pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2149 #else
2150 /* Flushing HDC pipeline requires DC Flush on earlier HW. */
2151 pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2152 #endif
2153 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2154 pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2155 pipe.RenderTargetCacheFlushEnable =
2156 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2157
2158 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2159 * be set with any PIPE_CONTROL with Depth Flush Enable bit set."
2160 */
2161 #if GFX_VER >= 12
2162 pipe.DepthStallEnable =
2163 pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
2164 #else
2165 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2166 #endif
2167
2168 #if GFX_VERx10 >= 125
2169 pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2170 #endif
2171
2172 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2173 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2174
2175 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
2176 *
2177 * "The most common action to perform upon reaching a
2178 * synchronization point is to write a value out to memory. An
2179 * immediate value (included with the synchronization command) may
2180 * be written."
2181 *
2182 *
2183 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
2184 *
2185 * "In case the data flushed out by the render engine is to be
2186 * read back in to the render engine in coherent manner, then the
2187 * render engine has to wait for the fence completion before
2188 * accessing the flushed data. This can be achieved by following
2189 * means on various products: PIPE_CONTROL command with CS Stall
2190 * and the required write caches flushed with Post-Sync-Operation
2191 * as Write Immediate Data.
2192 *
2193 * Example:
2194 * - Workload-1 (3D/GPGPU/MEDIA)
2195 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2196 * Immediate Data, Required Write Cache Flush bits set)
2197 * - Workload-2 (Can use the data produce or output by
2198 * Workload-1)
2199 */
2200 if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
2201 pipe.CommandStreamerStallEnable = true;
2202 pipe.PostSyncOperation = WriteImmediateData;
2203 pipe.Address = device->workaround_address;
2204 }
2205
2206 /*
2207 * According to the Broadwell documentation, any PIPE_CONTROL with the
2208 * "Command Streamer Stall" bit set must also have another bit set,
2209 * with six different options:
2210 *
2211 * - Render Target Cache Flush
2212 * - Depth Cache Flush
2213 * - Stall at Pixel Scoreboard
2214 * - Post-Sync Operation
2215 * - Depth Stall
2216 * - DC Flush Enable
2217 *
2218 * I chose "Stall at Pixel Scoreboard" since that's what we use in
2219 * mesa and it seems to work fine. The choice is fairly arbitrary.
2220 */
2221 if (pipe.CommandStreamerStallEnable &&
2222 !pipe.RenderTargetCacheFlushEnable &&
2223 !pipe.DepthCacheFlushEnable &&
2224 !pipe.StallAtPixelScoreboard &&
2225 !pipe.PostSyncOperation &&
2226 !pipe.DepthStallEnable &&
2227 !pipe.DCFlushEnable)
2228 pipe.StallAtPixelScoreboard = true;
2229 anv_debug_dump_pc(pipe);
2230 }
2231
2232 /* If a render target flush was emitted, then we can toggle off the bit
2233 * saying that render target writes are ongoing.
2234 */
2235 if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
2236 bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
2237
2238 if (GFX_VERx10 == 75) {
2239 /* Haswell needs additional workarounds:
2240 *
2241 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2242 *
2243 * Option 1:
2244 * PIPE_CONTROL command with the CS Stall and the required write
2245 * caches flushed with Post-SyncOperation as Write Immediate Data
2246 * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
2247 * space) commands.
2248 *
2249 * Example:
2250 * - Workload-1
2251 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2252 * Immediate Data, Required Write Cache Flush bits set)
2253 * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
2254 * - Workload-2 (Can use the data produce or output by
2255 * Workload-1)
2256 *
2257 * Unfortunately, both the PRMs and the internal docs are a bit
2258 * out-of-date in this regard. What the windows driver does (and
2259 * this appears to actually work) is to emit a register read from the
2260 * memory address written by the pipe control above.
2261 *
2262 * What register we load into doesn't matter. We choose an indirect
2263 * rendering register because we know it always exists and it's one
2264 * of the first registers the command parser allows us to write. If
2265 * you don't have command parser support in your kernel (pre-4.2),
2266 * this will get turned into MI_NOOP and you won't get the
2267 * workaround. Unfortunately, there's just not much we can do in
2268 * that case. This register is perfectly safe to write since we
2269 * always re-load all of the indirect draw registers right before
2270 * 3DPRIMITIVE when needed anyway.
2271 */
2272 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2273 lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
2274 lrm.MemoryAddress = device->workaround_address;
2275 }
2276 }
2277
2278 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2279 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2280 }
2281
2282 if (bits & ANV_PIPE_INVALIDATE_BITS) {
2283 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2284 *
2285 * "If the VF Cache Invalidation Enable is set to a 1 in a
2286 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2287 * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2288 * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2289 * a 1."
2290 *
2291 * This appears to hang Broadwell, so we restrict it to just gfx9.
2292 */
2293 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2294 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2295
2296 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2297 pipe.StateCacheInvalidationEnable =
2298 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2299 pipe.ConstantCacheInvalidationEnable =
2300 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2301 #if GFX_VER >= 12
2302 /* Invalidates the L3 cache part in which index & vertex data is loaded
2303 * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2304 */
2305 pipe.L3ReadOnlyCacheInvalidationEnable =
2306 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2307 #endif
2308 pipe.VFCacheInvalidationEnable =
2309 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2310 pipe.TextureCacheInvalidationEnable =
2311 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2312 pipe.InstructionCacheInvalidateEnable =
2313 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2314
2315 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2316 *
2317 * "When VF Cache Invalidate is set “Post Sync Operation” must be
2318 * enabled to “Write Immediate Data” or “Write PS Depth Count” or
2319 * “Write Timestamp”.
2320 */
2321 if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
2322 pipe.PostSyncOperation = WriteImmediateData;
2323 pipe.Address = device->workaround_address;
2324 }
2325 anv_debug_dump_pc(pipe);
2326 }
2327
2328 #if GFX_VER == 12
2329 if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info.has_aux_map) {
2330 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2331 lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
2332 lri.DataDWord = 1;
2333 }
2334 }
2335 #endif
2336
2337 bits &= ~ANV_PIPE_INVALIDATE_BITS;
2338 }
2339
2340 return bits;
2341 }
2342
2343 void
2344 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
2345 {
2346 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
2347
2348 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
2349 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
2350 else if (bits == 0)
2351 return;
2352
2353 bool trace_flush =
2354 (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
2355 if (trace_flush)
2356 trace_intel_begin_stall(&cmd_buffer->trace, cmd_buffer);
2357
2358 if ((GFX_VER >= 8 && GFX_VER <= 9) &&
2359 (bits & ANV_PIPE_CS_STALL_BIT) &&
2360 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
2361 /* If we are doing a VF cache invalidate AND a CS stall (it must be
2362 * both) then we can reset our vertex cache tracking.
2363 */
2364 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
2365 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
2366 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
2367 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
2368 }
2369
2370 cmd_buffer->state.pending_pipe_bits =
2371 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
2372 cmd_buffer->device,
2373 cmd_buffer->state.current_pipeline,
2374 bits);
2375
2376 if (trace_flush) {
2377 trace_intel_end_stall(&cmd_buffer->trace, cmd_buffer, bits,
2378 anv_pipe_flush_bit_to_ds_stall_flag, NULL);
2379 }
2380 }
2381
2382 static void
2383 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
2384 const VkDependencyInfoKHR *dep_info,
2385 const char *reason)
2386 {
2387 /* XXX: Right now, we're really dumb and just flush whatever categories
2388 * the app asks for. One of these days we may make this a bit better
2389 * but right now that's all the hardware allows for in most areas.
2390 */
2391 VkAccessFlags2KHR src_flags = 0;
2392 VkAccessFlags2KHR dst_flags = 0;
2393
2394 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
2395 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
2396 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
2397 }
2398
2399 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
2400 src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
2401 dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
2402 }
2403
2404 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
2405 const VkImageMemoryBarrier2KHR *img_barrier =
2406 &dep_info->pImageMemoryBarriers[i];
2407
2408 src_flags |= img_barrier->srcAccessMask;
2409 dst_flags |= img_barrier->dstAccessMask;
2410
2411 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
2412 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
2413
2414 uint32_t base_layer, layer_count;
2415 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
2416 base_layer = 0;
2417 layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
2418 } else {
2419 base_layer = range->baseArrayLayer;
2420 layer_count = vk_image_subresource_layer_count(&image->vk, range);
2421 }
2422 const uint32_t level_count =
2423 vk_image_subresource_level_count(&image->vk, range);
2424
2425 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2426 transition_depth_buffer(cmd_buffer, image,
2427 base_layer, layer_count,
2428 img_barrier->oldLayout,
2429 img_barrier->newLayout,
2430 false /* will_full_fast_clear */);
2431 }
2432
2433 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2434 transition_stencil_buffer(cmd_buffer, image,
2435 range->baseMipLevel, level_count,
2436 base_layer, layer_count,
2437 img_barrier->oldLayout,
2438 img_barrier->newLayout,
2439 false /* will_full_fast_clear */);
2440 }
2441
2442 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2443 VkImageAspectFlags color_aspects =
2444 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2445 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2446 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2447 range->baseMipLevel, level_count,
2448 base_layer, layer_count,
2449 img_barrier->oldLayout,
2450 img_barrier->newLayout,
2451 img_barrier->srcQueueFamilyIndex,
2452 img_barrier->dstQueueFamilyIndex,
2453 false /* will_full_fast_clear */);
2454 }
2455 }
2456 }
2457
2458 enum anv_pipe_bits bits =
2459 anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2460 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2461
2462 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2463 }
2464
2465 void genX(CmdPipelineBarrier2KHR)(
2466 VkCommandBuffer commandBuffer,
2467 const VkDependencyInfoKHR* pDependencyInfo)
2468 {
2469 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2470
2471 cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2472 }
2473
2474 static void
2475 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2476 {
2477 VkShaderStageFlags stages =
2478 cmd_buffer->state.gfx.pipeline->active_stages;
2479
2480 /* In order to avoid thrash, we assume that vertex and fragment stages
2481 * always exist. In the rare case where one is missing *and* the other
2482 * uses push constants, this may be suboptimal. However, avoiding stalls
2483 * seems more important.
2484 */
2485 stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2486 if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2487 stages |= VK_SHADER_STAGE_VERTEX_BIT;
2488
2489 if (stages == cmd_buffer->state.gfx.push_constant_stages)
2490 return;
2491
2492 const unsigned push_constant_kb =
2493 cmd_buffer->device->info.max_constant_urb_size_kb;
2494
2495 const unsigned num_stages =
2496 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2497 unsigned size_per_stage = push_constant_kb / num_stages;
2498
2499 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2500 * units of 2KB. Incidentally, these are the same platforms that have
2501 * 32KB worth of push constant space.
2502 */
2503 if (push_constant_kb == 32)
2504 size_per_stage &= ~1u;
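   /* For example, with 32KB of push constant space and only VS + FS active,
    * each stage is allotted 16KB. With VS, GS and FS active, size_per_stage
    * is 10KB (already a 2KB multiple) and the fragment stage later picks up
    * the remaining 12KB.
    */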
2505
2506 uint32_t kb_used = 0;
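   /* The 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,HS,DS,GS} packets share the same
    * layout and differ only in their sub-opcode (18..21), so we reuse the VS
    * pack struct and patch the sub-opcode per stage.
    */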
2507 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2508 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2509 anv_batch_emit(&cmd_buffer->batch,
2510 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2511 alloc._3DCommandSubOpcode = 18 + i;
2512 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2513 alloc.ConstantBufferSize = push_size;
2514 }
2515 kb_used += push_size;
2516 }
2517
2518 anv_batch_emit(&cmd_buffer->batch,
2519 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2520 alloc.ConstantBufferOffset = kb_used;
2521 alloc.ConstantBufferSize = push_constant_kb - kb_used;
2522 }
2523
2524 #if GFX_VERx10 == 125
2525 /* Wa_22011440098
2526 *
2527 * In 3D mode, after programming push constant alloc command immediately
2528 * program push constant command(ZERO length) without any commit between
2529 * them.
2530 */
2531 if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
2532 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
2533 c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
2534 }
2535 }
2536 #endif
2537
2538 cmd_buffer->state.gfx.push_constant_stages = stages;
2539
2540 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2541 *
2542 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2543 * the next 3DPRIMITIVE command after programming the
2544 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2545 *
2546 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2547 * pipeline setup, we need to dirty push constants.
2548 */
2549 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2550 }
2551
2552 static VkResult
2553 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2554 struct anv_cmd_pipeline_state *pipe_state,
2555 struct anv_shader_bin *shader,
2556 struct anv_state *bt_state)
2557 {
2558 uint32_t state_offset;
2559
2560 struct anv_pipeline_bind_map *map = &shader->bind_map;
2561 if (map->surface_count == 0) {
2562 *bt_state = (struct anv_state) { 0, };
2563 return VK_SUCCESS;
2564 }
2565
2566 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2567 map->surface_count,
2568 &state_offset);
2569 uint32_t *bt_map = bt_state->map;
2570
2571 if (bt_state->map == NULL)
2572 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2573
2574 /* We only need to emit relocs if we're not using softpin. If we are using
2575 * softpin then we always keep all user-allocated memory objects resident.
2576 */
2577 const bool need_client_mem_relocs =
2578 anv_use_relocations(cmd_buffer->device->physical);
2579 struct anv_push_constants *push = &pipe_state->push_constants;
2580
2581 for (uint32_t s = 0; s < map->surface_count; s++) {
2582 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2583
2584 struct anv_state surface_state;
2585
2586 switch (binding->set) {
2587 case ANV_DESCRIPTOR_SET_NULL:
2588 bt_map[s] = 0;
2589 break;
2590
2591 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2592 /* Color attachment binding */
2593 assert(shader->stage == MESA_SHADER_FRAGMENT);
2594 if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2595 const struct anv_attachment *att =
2596 &cmd_buffer->state.gfx.color_att[binding->index];
2597 surface_state = att->surface_state.state;
2598 } else {
2599 surface_state = cmd_buffer->state.gfx.null_surface_state;
2600 }
2601 assert(surface_state.map);
2602 bt_map[s] = surface_state.offset + state_offset;
2603 break;
2604
2605 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2606 struct anv_state surface_state =
2607 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2608
2609 struct anv_address constant_data = {
2610 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2611 .offset = shader->kernel.offset +
2612 shader->prog_data->const_data_offset,
2613 };
2614 unsigned constant_data_size = shader->prog_data->const_data_size;
2615
2616 const enum isl_format format =
2617 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2618 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2619 anv_fill_buffer_surface_state(cmd_buffer->device,
2620 surface_state, format,
2621 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2622 constant_data, constant_data_size, 1);
2623
2624 assert(surface_state.map);
2625 bt_map[s] = surface_state.offset + state_offset;
2626 add_surface_reloc(cmd_buffer, surface_state, constant_data);
2627 break;
2628 }
2629
2630 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2631 /* This is always the first binding for compute shaders */
2632 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2633
2634 struct anv_state surface_state =
2635 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2636
2637 const enum isl_format format =
2638 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2639 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
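         /* num_workgroups points at the three 32-bit dispatch dimensions,
          * hence the 12-byte range below.
          */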
2640 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2641 format,
2642 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2643 cmd_buffer->state.compute.num_workgroups,
2644 12, 1);
2645
2646 assert(surface_state.map);
2647 bt_map[s] = surface_state.offset + state_offset;
2648 if (need_client_mem_relocs) {
2649 add_surface_reloc(cmd_buffer, surface_state,
2650 cmd_buffer->state.compute.num_workgroups);
2651 }
2652 break;
2653 }
2654
2655 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2656 /* This is a descriptor set buffer so the set index is actually
2657 * given by binding->binding. (Yes, that's confusing.)
2658 */
2659 struct anv_descriptor_set *set =
2660 pipe_state->descriptors[binding->index];
2661 assert(set->desc_mem.alloc_size);
2662 assert(set->desc_surface_state.alloc_size);
2663 bt_map[s] = set->desc_surface_state.offset + state_offset;
2664 add_surface_reloc(cmd_buffer, set->desc_surface_state,
2665 anv_descriptor_set_address(set));
2666 break;
2667 }
2668
2669 default: {
2670 assert(binding->set < MAX_SETS);
2671 const struct anv_descriptor_set *set =
2672 pipe_state->descriptors[binding->set];
2673 if (binding->index >= set->descriptor_count) {
2674 /* From the Vulkan spec section entitled "DescriptorSet and
2675 * Binding Assignment":
2676 *
2677 * "If the array is runtime-sized, then array elements greater
2678 * than or equal to the size of that binding in the bound
2679 * descriptor set must not be used."
2680 *
2681 * Unfortunately, the compiler isn't smart enough to figure out
2682 * when a dynamic binding isn't used so it may grab the whole
2683 * array and stick it in the binding table. In this case, it's
2684 * safe to just skip those bindings that are OOB.
2685 */
2686 assert(binding->index < set->layout->descriptor_count);
2687 continue;
2688 }
2689 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2690
2691 switch (desc->type) {
2692 case VK_DESCRIPTOR_TYPE_SAMPLER:
2693 /* Nothing for us to do here */
2694 continue;
2695
2696 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2697 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2698 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2699 if (desc->image_view) {
2700 struct anv_surface_state sstate =
2701 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2702 desc->image_view->planes[binding->plane].general_sampler_surface_state :
2703 desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2704 surface_state = sstate.state;
2705 assert(surface_state.alloc_size);
2706 if (need_client_mem_relocs)
2707 add_surface_state_relocs(cmd_buffer, sstate);
2708 } else {
2709 surface_state = cmd_buffer->device->null_surface_state;
2710 }
2711 break;
2712 }
2713
2714 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2715 if (desc->image_view) {
2716 struct anv_surface_state sstate =
2717 binding->lowered_storage_surface
2718 ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2719 : desc->image_view->planes[binding->plane].storage_surface_state;
2720 surface_state = sstate.state;
2721 assert(surface_state.alloc_size);
2722 if (surface_state.offset == 0) {
2723 mesa_loge("Bound a image to a descriptor where the "
2724 "descriptor does not have NonReadable "
2725 "set and the image does not have a "
2726 "corresponding SPIR-V format enum.");
2727 vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2728 VK_DEBUG_REPORT_ERROR_BIT_EXT,
2729 &desc->image_view->vk.base,
2730 __LINE__, 0, "anv",
2731 "Bound a image to a descriptor where the "
2732 "descriptor does not have NonReadable "
2733 "set and the image does not have a "
2734 "corresponding SPIR-V format enum.");
2735 }
2736 if (surface_state.offset && need_client_mem_relocs)
2737 add_surface_state_relocs(cmd_buffer, sstate);
2738 } else {
2739 surface_state = cmd_buffer->device->null_surface_state;
2740 }
2741 break;
2742 }
2743
2744 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2745 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2746 if (desc->set_buffer_view) {
2747 surface_state = desc->set_buffer_view->surface_state;
2748 assert(surface_state.alloc_size);
2749 if (need_client_mem_relocs) {
2750 add_surface_reloc(cmd_buffer, surface_state,
2751 desc->set_buffer_view->address);
2752 }
2753 } else {
2754 surface_state = cmd_buffer->device->null_surface_state;
2755 }
2756 break;
2757
2758 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2759 if (desc->buffer_view) {
2760 surface_state = desc->buffer_view->surface_state;
2761 assert(surface_state.alloc_size);
2762 if (need_client_mem_relocs) {
2763 add_surface_reloc(cmd_buffer, surface_state,
2764 desc->buffer_view->address);
2765 }
2766 } else {
2767 surface_state = cmd_buffer->device->null_surface_state;
2768 }
2769 break;
2770
2771 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2772 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2773 if (desc->buffer) {
2774 /* Compute the offset within the buffer */
2775 uint32_t dynamic_offset =
2776 push->dynamic_offsets[binding->dynamic_offset_index];
2777 uint64_t offset = desc->offset + dynamic_offset;
2778 /* Clamp to the buffer size */
2779 offset = MIN2(offset, desc->buffer->size);
2780 /* Clamp the range to the buffer size */
2781 uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
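            /* If the dynamic offset pushes past the end of the buffer, the
             * clamps above collapse the range to zero rather than allowing
             * an out-of-bounds access.
             */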
2782
2783 /* Align the range for consistency */
2784 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2785 range = align_u32(range, ANV_UBO_ALIGNMENT);
2786
2787 struct anv_address address =
2788 anv_address_add(desc->buffer->address, offset);
2789
2790 surface_state =
2791 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2792 enum isl_format format =
2793 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2794 desc->type);
2795
2796 isl_surf_usage_flags_t usage =
2797 desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2798 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2799 ISL_SURF_USAGE_STORAGE_BIT;
2800
2801 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2802 format, usage, address, range, 1);
2803 if (need_client_mem_relocs)
2804 add_surface_reloc(cmd_buffer, surface_state, address);
2805 } else {
2806 surface_state = cmd_buffer->device->null_surface_state;
2807 }
2808 break;
2809 }
2810
2811 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2812 if (desc->buffer_view) {
2813 surface_state = binding->lowered_storage_surface
2814 ? desc->buffer_view->lowered_storage_surface_state
2815 : desc->buffer_view->storage_surface_state;
2816 assert(surface_state.alloc_size);
2817 if (need_client_mem_relocs) {
2818 add_surface_reloc(cmd_buffer, surface_state,
2819 desc->buffer_view->address);
2820 }
2821 } else {
2822 surface_state = cmd_buffer->device->null_surface_state;
2823 }
2824 break;
2825
2826 default:
2827 assert(!"Invalid descriptor type");
2828 continue;
2829 }
2830 assert(surface_state.map);
2831 bt_map[s] = surface_state.offset + state_offset;
2832 break;
2833 }
2834 }
2835 }
2836
2837 return VK_SUCCESS;
2838 }
2839
2840 static VkResult
2841 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2842 struct anv_cmd_pipeline_state *pipe_state,
2843 struct anv_shader_bin *shader,
2844 struct anv_state *state)
2845 {
2846 struct anv_pipeline_bind_map *map = &shader->bind_map;
2847 if (map->sampler_count == 0) {
2848 *state = (struct anv_state) { 0, };
2849 return VK_SUCCESS;
2850 }
2851
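   /* Each SAMPLER_STATE entry is 4 dwords (16 bytes). */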
2852 uint32_t size = map->sampler_count * 16;
2853 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2854
2855 if (state->map == NULL)
2856 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2857
2858 for (uint32_t s = 0; s < map->sampler_count; s++) {
2859 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2860 const struct anv_descriptor *desc =
2861 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2862
2863 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2864 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2865 continue;
2866
2867 struct anv_sampler *sampler = desc->sampler;
2868
2869 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2870 * happens to be zero.
2871 */
2872 if (sampler == NULL)
2873 continue;
2874
2875 memcpy(state->map + (s * 16),
2876 sampler->state[binding->plane], sizeof(sampler->state[0]));
2877 }
2878
2879 return VK_SUCCESS;
2880 }
2881
2882 static uint32_t
2883 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2884 struct anv_cmd_pipeline_state *pipe_state,
2885 const VkShaderStageFlags dirty,
2886 struct anv_shader_bin **shaders,
2887 uint32_t num_shaders)
2888 {
2889 VkShaderStageFlags flushed = 0;
2890
2891 VkResult result = VK_SUCCESS;
2892 for (uint32_t i = 0; i < num_shaders; i++) {
2893 if (!shaders[i])
2894 continue;
2895
2896 gl_shader_stage stage = shaders[i]->stage;
2897 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2898 if ((vk_stage & dirty) == 0)
2899 continue;
2900
2901 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2902 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2903 &cmd_buffer->state.samplers[stage]);
2904 if (result != VK_SUCCESS)
2905 break;
2906
2907 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2908 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2909 &cmd_buffer->state.binding_tables[stage]);
2910 if (result != VK_SUCCESS)
2911 break;
2912
2913 flushed |= vk_stage;
2914 }
2915
2916 if (result != VK_SUCCESS) {
2917 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2918
2919 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2920 if (result != VK_SUCCESS)
2921 return 0;
2922
2923 /* Re-emit state base addresses so we get the new surface state base
2924 * address before we start emitting binding tables etc.
2925 */
2926 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2927
2928 /* Re-emit all active binding tables */
2929 flushed = 0;
2930
2931 for (uint32_t i = 0; i < num_shaders; i++) {
2932 if (!shaders[i])
2933 continue;
2934
2935 gl_shader_stage stage = shaders[i]->stage;
2936
2937 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2938 &cmd_buffer->state.samplers[stage]);
2939 if (result != VK_SUCCESS) {
2940 anv_batch_set_error(&cmd_buffer->batch, result);
2941 return 0;
2942 }
2943 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2944 &cmd_buffer->state.binding_tables[stage]);
2945 if (result != VK_SUCCESS) {
2946 anv_batch_set_error(&cmd_buffer->batch, result);
2947 return 0;
2948 }
2949
2950 flushed |= mesa_to_vk_shader_stage(stage);
2951 }
2952 }
2953
2954 return flushed;
2955 }
2956
2957 static void
2958 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2959 uint32_t stages)
2960 {
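   /* The per-stage 3DSTATE_SAMPLER_STATE_POINTERS_* and
    * 3DSTATE_BINDING_TABLE_POINTERS_* packets share a single layout and
    * differ only in their sub-opcode, so we emit the _VS variant and patch
    * _3DCommandSubOpcode from the tables below.
    */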
2961 static const uint32_t sampler_state_opcodes[] = {
2962 [MESA_SHADER_VERTEX] = 43,
2963 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
2964 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
2965 [MESA_SHADER_GEOMETRY] = 46,
2966 [MESA_SHADER_FRAGMENT] = 47,
2967 };
2968
2969 static const uint32_t binding_table_opcodes[] = {
2970 [MESA_SHADER_VERTEX] = 38,
2971 [MESA_SHADER_TESS_CTRL] = 39,
2972 [MESA_SHADER_TESS_EVAL] = 40,
2973 [MESA_SHADER_GEOMETRY] = 41,
2974 [MESA_SHADER_FRAGMENT] = 42,
2975 };
2976
2977 anv_foreach_stage(s, stages) {
2978 assert(s < ARRAY_SIZE(binding_table_opcodes));
2979
2980 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2981 anv_batch_emit(&cmd_buffer->batch,
2982 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2983 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2984 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2985 }
2986 }
2987
2988 /* Always emit binding table pointers if we're asked to, since on SKL
2989 * this is what flushes push constants. */
2990 anv_batch_emit(&cmd_buffer->batch,
2991 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
2992 btp._3DCommandSubOpcode = binding_table_opcodes[s];
2993 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
2994 }
2995 }
2996 }
2997
2998 static struct anv_address
2999 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
3000 const struct anv_shader_bin *shader,
3001 const struct anv_push_range *range)
3002 {
3003 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3004 switch (range->set) {
3005 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3006 /* This is a descriptor set buffer so the set index is
3007 * actually given by binding->binding. (Yes, that's
3008 * confusing.)
3009 */
3010 struct anv_descriptor_set *set =
3011 gfx_state->base.descriptors[range->index];
3012 return anv_descriptor_set_address(set);
3013 }
3014
3015 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
3016 if (gfx_state->base.push_constants_state.alloc_size == 0) {
3017 gfx_state->base.push_constants_state =
3018 anv_cmd_buffer_gfx_push_constants(cmd_buffer);
3019 }
3020 return (struct anv_address) {
3021 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3022 .offset = gfx_state->base.push_constants_state.offset,
3023 };
3024 }
3025
3026 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3027 return (struct anv_address) {
3028 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
3029 .offset = shader->kernel.offset +
3030 shader->prog_data->const_data_offset,
3031 };
3032
3033 default: {
3034 assert(range->set < MAX_SETS);
3035 struct anv_descriptor_set *set =
3036 gfx_state->base.descriptors[range->set];
3037 const struct anv_descriptor *desc =
3038 &set->descriptors[range->index];
3039
3040 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3041 if (desc->buffer_view)
3042 return desc->buffer_view->address;
3043 } else {
3044 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3045 if (desc->buffer) {
3046 const struct anv_push_constants *push =
3047 &gfx_state->base.push_constants;
3048 uint32_t dynamic_offset =
3049 push->dynamic_offsets[range->dynamic_offset_index];
3050 return anv_address_add(desc->buffer->address,
3051 desc->offset + dynamic_offset);
3052 }
3053 }
3054
3055 /* For NULL UBOs, we just return an address in the workaround BO. We do
3056 * writes to it for workarounds but always at the bottom. The higher
3057 * bytes should be all zeros.
3058 */
3059 assert(range->length * 32 <= 2048);
3060 return (struct anv_address) {
3061 .bo = cmd_buffer->device->workaround_bo,
3062 .offset = 1024,
3063 };
3064 }
3065 }
3066 }
3067
3068
3069 /** Returns the size in bytes of the bound buffer
3070 *
3071 * The range is relative to the start of the buffer, not the start of the
3072 * range. The returned range may be smaller than
3073 *
3074 * (range->start + range->length) * 32;
3075 */
3076 static uint32_t
3077 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
3078 const struct anv_shader_bin *shader,
3079 const struct anv_push_range *range)
3080 {
3081 assert(shader->stage != MESA_SHADER_COMPUTE);
3082 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
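   /* range->start and range->length are expressed in 32-byte units (one
    * 256-bit GRF register each), so the "* 32" below converts register
    * counts to bytes.
    */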
3083 switch (range->set) {
3084 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3085 struct anv_descriptor_set *set =
3086 gfx_state->base.descriptors[range->index];
3087 assert(range->start * 32 < set->desc_mem.alloc_size);
3088 assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
3089 return set->desc_mem.alloc_size;
3090 }
3091
3092 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
3093 return (range->start + range->length) * 32;
3094
3095 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3096 return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
3097
3098 default: {
3099 assert(range->set < MAX_SETS);
3100 struct anv_descriptor_set *set =
3101 gfx_state->base.descriptors[range->set];
3102 const struct anv_descriptor *desc =
3103 &set->descriptors[range->index];
3104
3105 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3106 /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection.
3107 * We use the descriptor set's internally allocated surface state to fill the binding table entry.
3108 */
3109 if (!desc->set_buffer_view)
3110 return 0;
3111
3112 if (range->start * 32 > desc->set_buffer_view->range)
3113 return 0;
3114
3115 return desc->set_buffer_view->range;
3116 } else {
3117 if (!desc->buffer)
3118 return 0;
3119
3120 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3121 /* Compute the offset within the buffer */
3122 const struct anv_push_constants *push =
3123 &gfx_state->base.push_constants;
3124 uint32_t dynamic_offset =
3125 push->dynamic_offsets[range->dynamic_offset_index];
3126 uint64_t offset = desc->offset + dynamic_offset;
3127 /* Clamp to the buffer size */
3128 offset = MIN2(offset, desc->buffer->size);
3129 /* Clamp the range to the buffer size */
3130 uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);
3131
3132 /* Align the range for consistency */
3133 bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
3134
3135 return bound_range;
3136 }
3137 }
3138 }
3139 }
3140
3141 static void
3142 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
3143 gl_shader_stage stage,
3144 struct anv_address *buffers,
3145 unsigned buffer_count)
3146 {
3147 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3148 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3149
3150 static const uint32_t push_constant_opcodes[] = {
3151 [MESA_SHADER_VERTEX] = 21,
3152 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
3153 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
3154 [MESA_SHADER_GEOMETRY] = 22,
3155 [MESA_SHADER_FRAGMENT] = 23,
3156 };
3157
3158 assert(stage < ARRAY_SIZE(push_constant_opcodes));
3159
3160 UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
3161
3162 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
3163 c._3DCommandSubOpcode = push_constant_opcodes[stage];
3164
3165 /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
3166 *
3167 * "Constant Buffer Object Control State must be always
3168 * programmed to zero."
3169 *
3170 * This restriction does not exist on any newer platforms.
3171 *
3172 * We only have one MOCS field for the whole packet, not one per
3173 * buffer. We could go out of our way here to walk over all of
3174 * the buffers and see if any of them are used externally and use
3175 * the external MOCS. However, the notion that someone would use
3176 * the same bit of memory for both scanout and a UBO is nuts.
3177 *
3178 * Let's not bother and assume it's all internal.
3179 */
3180 #if GFX_VER >= 9
3181 c.MOCS = mocs;
3182 #elif GFX_VER < 8
3183 c.ConstantBody.MOCS = mocs;
3184 #endif
3185
3186 if (anv_pipeline_has_stage(pipeline, stage)) {
3187 const struct anv_pipeline_bind_map *bind_map =
3188 &pipeline->shaders[stage]->bind_map;
3189
3190 #if GFX_VERx10 >= 75
3191 /* The Skylake PRM contains the following restriction:
3192 *
3193 * "The driver must ensure The following case does not occur
3194 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3195 * buffer 3 read length equal to zero committed followed by a
3196 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3197 * zero committed."
3198 *
3199 * To avoid this, we program the buffers in the highest slots.
3200 * This way, slot 0 is only used if slot 3 is also used.
3201 */
3202 assert(buffer_count <= 4);
3203 const unsigned shift = 4 - buffer_count;
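         /* For example, buffer_count == 2 gives shift == 2, so the two
          * ranges land in ConstantBody slots 2 and 3 and slots 0 and 1 stay
          * unprogrammed.
          */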
3204 for (unsigned i = 0; i < buffer_count; i++) {
3205 const struct anv_push_range *range = &bind_map->push_ranges[i];
3206
3207 /* At this point we only have non-empty ranges */
3208 assert(range->length > 0);
3209
3210 /* For Ivy Bridge, make sure we only set the first range (actual
3211 * push constants)
3212 */
3213 assert((GFX_VERx10 >= 75) || i == 0);
3214
3215 c.ConstantBody.ReadLength[i + shift] = range->length;
3216 c.ConstantBody.Buffer[i + shift] =
3217 anv_address_add(buffers[i], range->start * 32);
3218 }
3219 #else
3220 /* For Ivy Bridge, push constants are relative to dynamic state
3221 * base address and we only ever push actual push constants.
3222 */
3223 if (bind_map->push_ranges[0].length > 0) {
3224 assert(buffer_count == 1);
3225 assert(bind_map->push_ranges[0].set ==
3226 ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
3227 assert(buffers[0].bo ==
3228 cmd_buffer->device->dynamic_state_pool.block_pool.bo);
3229 c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
3230 c.ConstantBody.Buffer[0].bo = NULL;
3231 c.ConstantBody.Buffer[0].offset = buffers[0].offset;
3232 }
3233 assert(bind_map->push_ranges[1].length == 0);
3234 assert(bind_map->push_ranges[2].length == 0);
3235 assert(bind_map->push_ranges[3].length == 0);
3236 #endif
3237 }
3238 }
3239 }
3240
3241 #if GFX_VER >= 12
3242 static void
3243 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
3244 uint32_t shader_mask,
3245 struct anv_address *buffers,
3246 uint32_t buffer_count)
3247 {
3248 if (buffer_count == 0) {
3249 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
3250 c.ShaderUpdateEnable = shader_mask;
3251 c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
3252 }
3253 return;
3254 }
3255
3256 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3257 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3258
3259 static const UNUSED uint32_t push_constant_opcodes[] = {
3260 [MESA_SHADER_VERTEX] = 21,
3261 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
3262 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
3263 [MESA_SHADER_GEOMETRY] = 22,
3264 [MESA_SHADER_FRAGMENT] = 23,
3265 };
3266
3267 gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
3268 assert(stage < ARRAY_SIZE(push_constant_opcodes));
3269
3270 const struct anv_pipeline_bind_map *bind_map =
3271 &pipeline->shaders[stage]->bind_map;
3272
3273 uint32_t *dw;
3274 const uint32_t buffer_mask = (1 << buffer_count) - 1;
3275 const uint32_t num_dwords = 2 + 2 * buffer_count;
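   /* The packet is a 2-dword header followed by one 2-dword
    * 3DSTATE_CONSTANT_ALL_DATA entry for each buffer named in
    * PointerBufferMask, hence the dword count above.
    */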
3276
3277 dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3278 GENX(3DSTATE_CONSTANT_ALL),
3279 .ShaderUpdateEnable = shader_mask,
3280 .PointerBufferMask = buffer_mask,
3281 .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
3282
3283 for (int i = 0; i < buffer_count; i++) {
3284 const struct anv_push_range *range = &bind_map->push_ranges[i];
3285 GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
3286 &cmd_buffer->batch, dw + 2 + i * 2,
3287 &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
3288 .PointerToConstantBuffer =
3289 anv_address_add(buffers[i], range->start * 32),
3290 .ConstantBufferReadLength = range->length,
3291 });
3292 }
3293 }
3294 #endif
3295
3296 static void
3297 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
3298 VkShaderStageFlags dirty_stages)
3299 {
3300 VkShaderStageFlags flushed = 0;
3301 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3302 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3303
3304 #if GFX_VER >= 12
3305 uint32_t nobuffer_stages = 0;
3306 #endif
3307
3308 /* Compute robust pushed register access mask for each stage. */
3309 if (cmd_buffer->device->robust_buffer_access) {
3310 anv_foreach_stage(stage, dirty_stages) {
3311 if (!anv_pipeline_has_stage(pipeline, stage))
3312 continue;
3313
3314 const struct anv_shader_bin *shader = pipeline->shaders[stage];
3315 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3316 struct anv_push_constants *push = &gfx_state->base.push_constants;
3317
3318 push->push_reg_mask[stage] = 0;
3319 /* Start of the current range in the shader, relative to the start of
3320 * push constants in the shader.
3321 */
3322 unsigned range_start_reg = 0;
3323 for (unsigned i = 0; i < 4; i++) {
3324 const struct anv_push_range *range = &bind_map->push_ranges[i];
3325 if (range->length == 0)
3326 continue;
3327
3328 unsigned bound_size =
3329 get_push_range_bound_size(cmd_buffer, shader, range);
3330 if (bound_size >= range->start * 32) {
3331 unsigned bound_regs =
3332 MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
3333 range->length);
3334 assert(range_start_reg + bound_regs <= 64);
3335 push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
3336 bound_regs);
3337 }
3338
3339 cmd_buffer->state.push_constants_dirty |=
3340 mesa_to_vk_shader_stage(stage);
3341
3342 range_start_reg += range->length;
3343 }
3344 }
3345 }
3346
3347 /* Resets the push constant state so that we allocate a new one if
3348 * needed.
3349 */
3350 gfx_state->base.push_constants_state = ANV_STATE_NULL;
3351
3352 anv_foreach_stage(stage, dirty_stages) {
3353 unsigned buffer_count = 0;
3354 flushed |= mesa_to_vk_shader_stage(stage);
3355 UNUSED uint32_t max_push_range = 0;
3356
3357 struct anv_address buffers[4] = {};
3358 if (anv_pipeline_has_stage(pipeline, stage)) {
3359 const struct anv_shader_bin *shader = pipeline->shaders[stage];
3360 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3361
3362 /* We have to gather buffer addresses as a second step because the
3363 * loop above puts data into the push constant area and the call to
3364 * get_push_range_address is what locks our push constants and copies
3365 * them into the actual GPU buffer. If we did the two loops at the
3366 * same time, we'd risk only having some of the sizes in the push
3367 * constant buffer when we did the copy.
3368 */
3369 for (unsigned i = 0; i < 4; i++) {
3370 const struct anv_push_range *range = &bind_map->push_ranges[i];
3371 if (range->length == 0)
3372 break;
3373
3374 buffers[i] = get_push_range_address(cmd_buffer, shader, range);
3375 max_push_range = MAX2(max_push_range, range->length);
3376 buffer_count++;
3377 }
3378
3379 /* We have at most 4 buffers but they should be tightly packed */
3380 for (unsigned i = buffer_count; i < 4; i++)
3381 assert(bind_map->push_ranges[i].length == 0);
3382 }
3383
3384 #if GFX_VER >= 12
3385 /* If this stage doesn't have any push constants, emit it later in a
3386 * single CONSTANT_ALL packet.
3387 */
3388 if (buffer_count == 0) {
3389 nobuffer_stages |= 1 << stage;
3390 continue;
3391 }
3392
3393 /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
3394 * contains only 5 bits, so we can only use it for buffers smaller than
3395 * 32.
3396 */
3397 if (max_push_range < 32) {
3398 cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
3399 buffers, buffer_count);
3400 continue;
3401 }
3402 #endif
3403
3404 cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
3405 }
3406
3407 #if GFX_VER >= 12
3408 if (nobuffer_stages)
3409 cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
3410 #endif
3411
3412 cmd_buffer->state.push_constants_dirty &= ~flushed;
3413 }
3414
3415 #if GFX_VERx10 >= 125
3416 static void
3417 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
3418 VkShaderStageFlags dirty_stages)
3419 {
3420 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3421 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3422
3423 if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_NV &&
3424 anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
3425
3426 const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_TASK];
3427 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3428
3429 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
3430 const struct anv_push_range *range = &bind_map->push_ranges[0];
3431 if (range->length > 0) {
3432 struct anv_address buffer =
3433 get_push_range_address(cmd_buffer, shader, range);
3434
3435 uint64_t addr = anv_address_physical(buffer);
3436 data.InlineData[0] = addr & 0xffffffff;
3437 data.InlineData[1] = addr >> 32;
3438
3439 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3440 cmd_buffer->state.gfx.base.push_constants.client_data,
3441 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3442 }
3443 }
3444 }
3445
3446 if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_NV &&
3447 anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
3448
3449 const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_MESH];
3450 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3451
3452 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
3453 const struct anv_push_range *range = &bind_map->push_ranges[0];
3454 if (range->length > 0) {
3455 struct anv_address buffer =
3456 get_push_range_address(cmd_buffer, shader, range);
3457
3458 uint64_t addr = anv_address_physical(buffer);
3459 data.InlineData[0] = addr & 0xffffffff;
3460 data.InlineData[1] = addr >> 32;
3461
3462 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3463 cmd_buffer->state.gfx.base.push_constants.client_data,
3464 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3465 }
3466 }
3467 }
3468
3469 cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
3470 }
3471 #endif
3472
3473 static void
3474 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
3475 {
3476 const uint32_t clip_states =
3477 #if GFX_VER <= 7
3478 ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3479 ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
3480 #endif
3481 ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
3482 ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
3483 ANV_CMD_DIRTY_PIPELINE;
3484
3485 if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)
3486 return;
3487
3488 /* Take dynamic primitive topology into account with
3489 * 3DSTATE_CLIP::ViewportXYClipTestEnable
3490 */
3491 bool xy_clip_test_enable = 0;
3492
3493 if (cmd_buffer->state.gfx.pipeline->dynamic_states &
3494 ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
3495 VkPrimitiveTopology primitive_topology =
3496 cmd_buffer->state.gfx.dynamic.primitive_topology;
3497
3498 VkPolygonMode dynamic_raster_mode =
3499 genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
3500 primitive_topology);
3501
3502 xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
3503 }
3504
3505 #if GFX_VER <= 7
3506 const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
3507 #endif
3508 struct GENX(3DSTATE_CLIP) clip = {
3509 GENX(3DSTATE_CLIP_header),
3510 #if GFX_VER <= 7
3511 .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
3512 .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
3513 #endif
3514 .ViewportXYClipTestEnable = xy_clip_test_enable,
3515 };
3516 uint32_t dwords[GENX(3DSTATE_CLIP_length)];
3517
3518 /* TODO(mesh): Multiview. */
3519 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3520 if (anv_pipeline_is_primitive(pipeline)) {
3521 const struct brw_vue_prog_data *last =
3522 anv_pipeline_get_last_vue_prog_data(pipeline);
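      /* Only raise MaximumVPIndex when the last pre-rasterization stage
       * actually writes gl_ViewportIndex; otherwise every primitive uses
       * viewport 0 and the packed default of 0 is already correct.
       */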
3523 if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3524 clip.MaximumVPIndex =
3525 cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?
3526 cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;
3527 }
3528 } else if (anv_pipeline_is_mesh(pipeline)) {
3529 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
3530 if (mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
3531 uint32_t viewport_count = cmd_buffer->state.gfx.dynamic.viewport.count;
3532 clip.MaximumVPIndex = viewport_count > 0 ? viewport_count - 1 : 0;
3533 }
3534 }
3535
3536 GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
3537 anv_batch_emit_merge(&cmd_buffer->batch, dwords,
3538 pipeline->gfx7.clip);
3539 }
3540
3541 static void
3542 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
3543 {
3544 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3545 uint32_t count = gfx->dynamic.viewport.count;
3546 const VkViewport *viewports = gfx->dynamic.viewport.viewports;
3547 struct anv_state sf_clip_state =
3548 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
3549
3550 bool negative_one_to_one =
3551 cmd_buffer->state.gfx.pipeline->negative_one_to_one;
3552
3553 float scale = negative_one_to_one ? 0.5f : 1.0f;
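   /* negative_one_to_one comes from VK_EXT_depth_clip_control: NDC z then
    * spans [-1,1] and depth maps as z * (max - min) / 2 + (min + max) / 2.
    * In the default [0,1] case it is z * (max - min) + min. This matches the
    * m22/m32 matrix elements computed below.
    */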
3554
3555 for (uint32_t i = 0; i < count; i++) {
3556 const VkViewport *vp = &viewports[i];
3557
3558 /* The gfx7 state struct has just the matrix and guardband fields, the
3559 * gfx8 struct adds the min/max viewport fields. */
3560 struct GENX(SF_CLIP_VIEWPORT) sfv = {
3561 .ViewportMatrixElementm00 = vp->width / 2,
3562 .ViewportMatrixElementm11 = vp->height / 2,
3563 .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
3564 .ViewportMatrixElementm30 = vp->x + vp->width / 2,
3565 .ViewportMatrixElementm31 = vp->y + vp->height / 2,
3566 .ViewportMatrixElementm32 = negative_one_to_one ?
3567 (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
3568 .XMinClipGuardband = -1.0f,
3569 .XMaxClipGuardband = 1.0f,
3570 .YMinClipGuardband = -1.0f,
3571 .YMaxClipGuardband = 1.0f,
3572 #if GFX_VER >= 8
3573 .XMinViewPort = vp->x,
3574 .XMaxViewPort = vp->x + vp->width - 1,
3575 .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
3576 .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
3577 #endif
3578 };
3579
3580 const uint32_t fb_size_max = 1 << 14;
3581 uint32_t x_min = 0, x_max = fb_size_max;
3582 uint32_t y_min = 0, y_max = fb_size_max;
3583
3584 /* If we have a valid renderArea, include that */
3585 if (gfx->render_area.extent.width > 0 &&
3586 gfx->render_area.extent.height > 0) {
3587 x_min = MAX2(x_min, gfx->render_area.offset.x);
3588 x_max = MIN2(x_max, gfx->render_area.offset.x +
3589 gfx->render_area.extent.width);
3590 y_min = MAX2(y_min, gfx->render_area.offset.y);
3591 y_max = MIN2(y_max, gfx->render_area.offset.y +
3592 gfx->render_area.extent.height);
3593 }
3594
3595 /* The client is required to have enough scissors for whatever it sets
3596 * as ViewportIndex but it's possible that they've got more viewports
3597 * set from a previous command. Also, from the Vulkan 1.3.207 spec:
3598 *
3599 * "The application must ensure (using scissor if necessary) that
3600 * all rendering is contained within the render area."
3601 *
3602 * If the client doesn't set a scissor, that basically means it
3603 * guarantees everything is in-bounds already. If we end up using a
3604 * guardband of [-1, 1] in that case, there shouldn't be much loss.
3605 * It's theoretically possible that they could do all their clipping
3606 * with clip planes but that'd be a bit odd.
3607 */
3608 if (i < gfx->dynamic.scissor.count) {
3609 const VkRect2D *scissor = &gfx->dynamic.scissor.scissors[i];
3610 x_min = MAX2(x_min, scissor->offset.x);
3611 x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
3612 y_min = MAX2(y_min, scissor->offset.y);
3613 y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
3614 }
3615
3616 /* Only bother calculating the guardband if our known render area is
3617 * less than the maximum size. Otherwise, it will calculate [-1, 1]
3618 * anyway but possibly with precision loss.
3619 */
3620 if (x_min > 0 || x_max < fb_size_max ||
3621 y_min > 0 || y_max < fb_size_max) {
3622 intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
3623 sfv.ViewportMatrixElementm00,
3624 sfv.ViewportMatrixElementm11,
3625 sfv.ViewportMatrixElementm30,
3626 sfv.ViewportMatrixElementm31,
3627 &sfv.XMinClipGuardband,
3628 &sfv.XMaxClipGuardband,
3629 &sfv.YMinClipGuardband,
3630 &sfv.YMaxClipGuardband);
3631 }
3632
3633 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3634 }
3635
3636 anv_batch_emit(&cmd_buffer->batch,
3637 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3638 clip.SFClipViewportPointer = sf_clip_state.offset;
3639 }
3640 }
3641
3642 static void
3643 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3644 bool depth_clamp_enable)
3645 {
3646 uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
3647 const VkViewport *viewports =
3648 cmd_buffer->state.gfx.dynamic.viewport.viewports;
3649 struct anv_state cc_state =
3650 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3651
3652 for (uint32_t i = 0; i < count; i++) {
3653 const VkViewport *vp = &viewports[i];
3654
3655 /* From the Vulkan spec:
3656 *
3657 * "It is valid for minDepth to be greater than or equal to
3658 * maxDepth."
3659 */
3660 float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3661 float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3662
3663 struct GENX(CC_VIEWPORT) cc_viewport = {
3664 .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3665 .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3666 };
3667
3668 GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3669 }
3670
3671 anv_batch_emit(&cmd_buffer->batch,
3672 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3673 cc.CCViewportPointer = cc_state.offset;
3674 }
3675 }
3676
3677 static int64_t
3678 clamp_int64(int64_t x, int64_t min, int64_t max)
3679 {
3680 if (x < min)
3681 return min;
3682 else if (x < max)
3683 return x;
3684 else
3685 return max;
3686 }
3687
3688 static void
3689 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3690 {
3691 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3692 uint32_t count = gfx->dynamic.scissor.count;
3693 const VkRect2D *scissors = gfx->dynamic.scissor.scissors;
3694 const VkViewport *viewports = gfx->dynamic.viewport.viewports;
3695
3696 /* Wa_1409725701:
3697 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3698 * stored as an array of up to 16 elements. The location of first
3699 * element of the array, as specified by Pointer to SCISSOR_RECT, should
3700 * be aligned to a 64-byte boundary."
3701 */
3702 uint32_t alignment = 64;
3703 struct anv_state scissor_state =
3704 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3705
3706 for (uint32_t i = 0; i < count; i++) {
3707 const VkRect2D *s = &scissors[i];
3708 const VkViewport *vp = &viewports[i];
3709
3710 /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3711 * ymax < ymin for empty clips. In case clip x, y, width height are all
3712 * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3713 * what we want. Just special case empty clips and produce a canonical
3714 * empty clip. */
3715 static const struct GENX(SCISSOR_RECT) empty_scissor = {
3716 .ScissorRectangleYMin = 1,
3717 .ScissorRectangleXMin = 1,
3718 .ScissorRectangleYMax = 0,
3719 .ScissorRectangleXMax = 0
3720 };
3721
3722 const int max = 0xffff;
3723
3724 uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3725 uint32_t x_min = MAX2(s->offset.x, vp->x);
3726 uint32_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3727 MAX2(vp->y, vp->y + vp->height) - 1);
3728 uint32_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3729 vp->x + vp->width - 1);
3730
3731 /* Do this math using int64_t so overflow gets clamped correctly. */
3732 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3733 y_min = clamp_int64((uint64_t) y_min, gfx->render_area.offset.y, max);
3734 x_min = clamp_int64((uint64_t) x_min, gfx->render_area.offset.x, max);
3735 y_max = clamp_int64((uint64_t) y_max, 0,
3736 gfx->render_area.offset.y +
3737 gfx->render_area.extent.height - 1);
3738 x_max = clamp_int64((uint64_t) x_max, 0,
3739 gfx->render_area.offset.x +
3740 gfx->render_area.extent.width - 1);
3741 }
3742
3743 struct GENX(SCISSOR_RECT) scissor = {
3744 .ScissorRectangleYMin = y_min,
3745 .ScissorRectangleXMin = x_min,
3746 .ScissorRectangleYMax = y_max,
3747 .ScissorRectangleXMax = x_max
3748 };
3749
3750 if (s->extent.width <= 0 || s->extent.height <= 0) {
3751 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3752 &empty_scissor);
3753 } else {
3754 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3755 }
3756 }
3757
3758 anv_batch_emit(&cmd_buffer->batch,
3759 GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3760 ssp.ScissorRectPointer = scissor_state.offset;
3761 }
3762 }
3763
3764 static void
3765 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3766 {
3767 const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
3768 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3769
3770 #if GFX_VER == 7
3771 # define streamout_state_dw pipeline->gfx7.streamout_state
3772 #else
3773 # define streamout_state_dw pipeline->gfx8.streamout_state
3774 #endif
3775
3776 uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3777
3778 struct GENX(3DSTATE_STREAMOUT) so = {
3779 GENX(3DSTATE_STREAMOUT_header),
3780 .RenderingDisable = d->raster_discard,
3781 };
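   /* Only the dynamic RenderingDisable bit is packed here;
    * anv_batch_emit_merge ORs these dwords with the 3DSTATE_STREAMOUT dwords
    * precomputed in the pipeline at creation time.
    */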
3782 GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3783 anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3784 }
3785
3786 void
3787 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
3788 {
3789 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3790 uint32_t *p;
3791
3792 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3793
3794 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3795
3796 genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
3797
3798 genX(flush_pipeline_select_3d)(cmd_buffer);
3799
3800 /* Apply any pending pipeline flushes we may have. We want to apply them
3801 * now because, if any of those flushes are for things like push constants,
3802 * the GPU will read the state at weird times.
3803 */
3804 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3805
3806 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3807 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3808 vb_emit |= pipeline->vb_used;
3809
3810 if (vb_emit) {
3811 const uint32_t num_buffers = __builtin_popcount(vb_emit);
3812 const uint32_t num_dwords = 1 + num_buffers * 4;
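      /* 3DSTATE_VERTEX_BUFFERS is a one-dword header followed by one 4-dword
       * VERTEX_BUFFER_STATE entry per buffer, hence the dword count above.
       */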
3813
3814 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3815 GENX(3DSTATE_VERTEX_BUFFERS));
3816 uint32_t i = 0;
3817 u_foreach_bit(vb, vb_emit) {
3818 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3819 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3820
3821 /* If dynamic, use stride/size from vertex binding, otherwise use
3822 * stride/size that was setup in the pipeline object.
3823 */
3824 bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;
3825 bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;
3826
3827 struct GENX(VERTEX_BUFFER_STATE) state;
3828 if (buffer) {
3829 uint32_t stride = dynamic_stride ?
3830 cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;
3831 /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):
3832 *
3833 * "If pname:pSizes is not NULL then pname:pSizes[i] specifies
3834 * the bound size of the vertex buffer starting from the corresponding
3835 * elements of pname:pBuffers[i] plus pname:pOffsets[i]."
3836 */
3837 UNUSED uint32_t size = dynamic_size ?
3838 cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;
3839
3840 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3841 .VertexBufferIndex = vb,
3842
3843 .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3844 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3845 #if GFX_VER <= 7
3846 .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA,
3847 .InstanceDataStepRate = pipeline->vb[vb].instance_divisor,
3848 #endif
3849 .AddressModifyEnable = true,
3850 .BufferPitch = stride,
3851 .BufferStartingAddress = anv_address_add(buffer->address, offset),
3852 .NullVertexBuffer = offset >= buffer->size,
3853 #if GFX_VER >= 12
3854 .L3BypassDisable = true,
3855 #endif
3856
3857 #if GFX_VER >= 8
3858 .BufferSize = size,
3859 #else
3860 /* XXX: to handle dynamic offset for older gens we might want
3861 * to modify Endaddress, but there are issues when doing so:
3862 *
3863 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3864 */
3865 .EndAddress = anv_address_add(buffer->address, buffer->size - 1),
3866 #endif
3867 };
3868 } else {
3869 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3870 .VertexBufferIndex = vb,
3871 .NullVertexBuffer = true,
3872 .MOCS = anv_mocs(cmd_buffer->device, NULL,
3873 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3874 };
3875 }
3876
3877 #if GFX_VER >= 8 && GFX_VER <= 9
3878 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3879 state.BufferStartingAddress,
3880 state.BufferSize);
3881 #endif
3882
3883 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3884 i++;
3885 }
3886 }
3887
3888 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3889
3890 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3891 pipeline->active_stages;
3892 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3893 !cmd_buffer->state.push_constants_dirty)
3894 return;
3895
3896 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3897 (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3898 ANV_CMD_DIRTY_PIPELINE))) {
3899 /* Wa_16011411144:
3900 *
3901 * SW must insert a PIPE_CONTROL cmd before and after the
3902 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3903 * state is not combined with other state changes.
3904 */
3905 if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3906 anv_add_pending_pipe_bits(cmd_buffer,
3907 ANV_PIPE_CS_STALL_BIT,
3908 "before SO_BUFFER change WA");
3909 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3910 }
3911
3912 /* We don't need any per-buffer dirty tracking because you're not
3913 * allowed to bind different XFB buffers while XFB is enabled.
3914 */
3915 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3916 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3917 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3918 #if GFX_VER < 12
3919 sob.SOBufferIndex = idx;
3920 #else
3921 sob._3DCommandOpcode = 0;
3922 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
3923 #endif
3924
3925 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3926 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
3927 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3928 xfb->offset);
3929 #if GFX_VER >= 8
3930 sob.SOBufferEnable = true;
3931 sob.StreamOffsetWriteEnable = false;
3932 /* Size is in DWords - 1 */
3933 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3934 #else
3935 /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3936 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3937 * default for an empty SO_BUFFER packet) to disable them.
3938 */
3939 sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3940 sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3941 xfb->offset + xfb->size);
3942 #endif
3943 } else {
3944 sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3945 }
3946 }
3947 }
3948
3949 if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3950 /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
3951 anv_add_pending_pipe_bits(cmd_buffer,
3952 ANV_PIPE_CS_STALL_BIT,
3953 "after SO_BUFFER change WA");
3954 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3955 } else if (GFX_VER >= 10) {
3956 /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
3957 anv_add_pending_pipe_bits(cmd_buffer,
3958 ANV_PIPE_CS_STALL_BIT,
3959 "after 3DSTATE_SO_BUFFER call");
3960 }
3961 }
3962
3963 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3964 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3965
3966 /* If the pipeline changed, we may need to re-allocate push constant
3967 * space in the URB.
3968 */
3969 cmd_buffer_alloc_push_constants(cmd_buffer);
3970 }
3971
3972 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3973 cmd_buffer->state.gfx.primitive_topology = pipeline->topology;
3974
3975 #if GFX_VER <= 7
3976 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3977 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3978 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3979 *
3980 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3981 * stall needs to be sent just prior to any 3DSTATE_VS,
3982 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3983 * 3DSTATE_BINDING_TABLE_POINTER_VS,
3984 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
3985 * PIPE_CONTROL needs to be sent before any combination of VS
3986 * associated 3DSTATE."
3987 */
3988 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3989 pc.DepthStallEnable = true;
3990 pc.PostSyncOperation = WriteImmediateData;
3991 pc.Address = cmd_buffer->device->workaround_address;
3992 anv_debug_dump_pc(pc);
3993 }
3994 }
3995 #endif
3996
3997 /* Render targets live in the same binding table as fragment descriptors */
3998 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3999 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
4000
4001 /* We emit the binding tables and sampler tables first, then emit push
4002 * constants and then finally emit binding table and sampler table
4003 * pointers. It has to happen in this order, since emitting the binding
4004 * tables may change the push constants (in case of storage images). After
4005 * emitting push constants, on SKL+ we have to emit the corresponding
4006 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
4007 */
4008 uint32_t dirty = 0;
4009 if (descriptors_dirty) {
4010 dirty = flush_descriptor_sets(cmd_buffer,
4011 &cmd_buffer->state.gfx.base,
4012 descriptors_dirty,
4013 pipeline->shaders,
4014 ARRAY_SIZE(pipeline->shaders));
4015 cmd_buffer->state.descriptors_dirty &= ~dirty;
4016 }
4017
4018 if (dirty || cmd_buffer->state.push_constants_dirty) {
4019 /* Because we're pushing UBOs, we have to push whenever either
4020 * descriptors or push constants is dirty.
4021 */
4022 dirty |= cmd_buffer->state.push_constants_dirty;
4023 cmd_buffer_flush_push_constants(cmd_buffer,
4024 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4025 #if GFX_VERx10 >= 125
4026 cmd_buffer_flush_mesh_inline_data(
4027 cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_NV |
4028 VK_SHADER_STAGE_MESH_BIT_NV));
4029 #endif
4030 }
4031
4032 if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
4033 cmd_buffer_emit_descriptor_pointers(cmd_buffer,
4034 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4035 }
4036
4037 cmd_buffer_emit_clip(cmd_buffer);
4038
4039 if (anv_cmd_buffer_needs_dynamic_state(cmd_buffer,
4040 ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE))
4041 cmd_buffer_emit_streamout(cmd_buffer);
4042
4043 if (anv_cmd_buffer_needs_dynamic_state(cmd_buffer,
4044 ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
4045 ANV_CMD_DIRTY_RENDER_TARGETS |
4046 ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)) {
4047 cmd_buffer_emit_viewport(cmd_buffer);
4048 cmd_buffer_emit_depth_viewport(cmd_buffer,
4049 pipeline->depth_clamp_enable);
4050 }
4051
4052 if (anv_cmd_buffer_needs_dynamic_state(cmd_buffer,
4053 ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
4054 ANV_CMD_DIRTY_RENDER_TARGETS |
4055 ANV_CMD_DIRTY_DYNAMIC_VIEWPORT))
4056 cmd_buffer_emit_scissor(cmd_buffer);
4057
4058 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
4059 }
4060
4061 static void
4062 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
4063 struct anv_address addr,
4064 uint32_t size, uint32_t index)
4065 {
4066 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
4067 GENX(3DSTATE_VERTEX_BUFFERS));
4068
4069 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
4070 &(struct GENX(VERTEX_BUFFER_STATE)) {
4071 .VertexBufferIndex = index,
4072 .AddressModifyEnable = true,
4073 .BufferPitch = 0,
4074 .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
4075 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
4076 .NullVertexBuffer = size == 0,
4077 #if GFX_VER >= 12
4078 .L3BypassDisable = true,
4079 #endif
4080 #if (GFX_VER >= 8)
4081 .BufferStartingAddress = addr,
4082 .BufferSize = size
4083 #else
4084 .BufferStartingAddress = addr,
4085 .EndAddress = anv_address_add(addr, size),
4086 #endif
4087 });
4088
4089 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
4090 index, addr, size);
4091 }
4092
4093 static void
4094 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
4095 struct anv_address addr)
4096 {
4097 emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
4098 }
4099
4100 static void
4101 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
4102 uint32_t base_vertex, uint32_t base_instance)
4103 {
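   /* gl_BaseVertex/gl_BaseInstance are fetched by the VS as vertex
    * attributes from a dedicated binding (ANV_SVGS_VB_INDEX): an 8-byte
    * buffer holding the two values, or a null vertex buffer when both are
    * zero.
    */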
4104 if (base_vertex == 0 && base_instance == 0) {
4105 emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
4106 } else {
4107 struct anv_state id_state =
4108 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
4109
4110 ((uint32_t *)id_state.map)[0] = base_vertex;
4111 ((uint32_t *)id_state.map)[1] = base_instance;
4112
4113 struct anv_address addr = {
4114 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4115 .offset = id_state.offset,
4116 };
4117
4118 emit_base_vertex_instance_bo(cmd_buffer, addr);
4119 }
4120 }
4121
4122 static void
4123 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
4124 {
4125 struct anv_state state =
4126 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
4127
4128 ((uint32_t *)state.map)[0] = draw_index;
4129
4130 struct anv_address addr = {
4131 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4132 .offset = state.offset,
4133 };
4134
4135 emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
4136 }
4137
4138 static void
4139 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
4140 uint32_t access_type)
4141 {
4142 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4143 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4144
4145 uint64_t vb_used = pipeline->vb_used;
4146 if (vs_prog_data->uses_firstvertex ||
4147 vs_prog_data->uses_baseinstance)
4148 vb_used |= 1ull << ANV_SVGS_VB_INDEX;
4149 if (vs_prog_data->uses_drawid)
4150 vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
4151
4152 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
4153 access_type == RANDOM,
4154 vb_used);
4155 }
4156
4157 ALWAYS_INLINE static void
4158 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
4159 const struct brw_vs_prog_data *vs_prog_data,
4160 uint32_t base_vertex,
4161 uint32_t base_instance,
4162 uint32_t draw_id,
4163 bool force_flush)
4164 {
4165 bool emitted = false;
4166 if (vs_prog_data->uses_firstvertex ||
4167 vs_prog_data->uses_baseinstance) {
4168 emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
4169 emitted = true;
4170 }
4171 if (vs_prog_data->uses_drawid) {
4172 emit_draw_index(cmd_buffer, draw_id);
4173 emitted = true;
4174 }
4175 /* Emitting draw index or vertex index BOs may result in needing
4176 * additional VF cache flushes.
4177 */
4178 if (emitted || force_flush)
4179 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4180 }
4181
4182 static unsigned
4183 anv_cmd_buffer_get_view_count(struct anv_cmd_buffer *cmd_buffer)
4184 {
4185 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4186 return MAX2(1, util_bitcount(gfx->view_mask));
4187 }
4188
4189 void genX(CmdDraw)(
4190 VkCommandBuffer commandBuffer,
4191 uint32_t vertexCount,
4192 uint32_t instanceCount,
4193 uint32_t firstVertex,
4194 uint32_t firstInstance)
4195 {
4196 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4197 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4198 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4199
4200 if (anv_batch_has_error(&cmd_buffer->batch))
4201 return;
4202
4203 const uint32_t count = (vertexCount *
4204 instanceCount *
4205 (pipeline->use_primitive_replication ?
4206 1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4207 anv_measure_snapshot(cmd_buffer,
4208 INTEL_SNAPSHOT_DRAW,
4209 "draw", count);
4210 trace_intel_begin_draw(&cmd_buffer->trace, cmd_buffer);
4211
4212 genX(cmd_buffer_flush_state)(cmd_buffer);
4213
4214 if (cmd_buffer->state.conditional_render_enabled)
4215 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4216
4217 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4218 firstVertex, firstInstance, 0,
4219 true);
4220
4221 /* Our implementation of VK_KHR_multiview uses instancing to draw the
4222 * different views. We need to multiply instanceCount by the view count.
4223 */
4224 if (!pipeline->use_primitive_replication)
4225 instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4226
4227 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4228 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4229 prim.VertexAccessType = SEQUENTIAL;
4230 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4231 prim.VertexCountPerInstance = vertexCount;
4232 prim.StartVertexLocation = firstVertex;
4233 prim.InstanceCount = instanceCount;
4234 prim.StartInstanceLocation = firstInstance;
4235 prim.BaseVertexLocation = 0;
4236 }
4237
4238 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4239
4240 trace_intel_end_draw(&cmd_buffer->trace, cmd_buffer, count);
4241 }
4242
4243 void genX(CmdDrawMultiEXT)(
4244 VkCommandBuffer commandBuffer,
4245 uint32_t drawCount,
4246 const VkMultiDrawInfoEXT *pVertexInfo,
4247 uint32_t instanceCount,
4248 uint32_t firstInstance,
4249 uint32_t stride)
4250 {
4251 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4252 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4253 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4254
4255 if (anv_batch_has_error(&cmd_buffer->batch))
4256 return;
4257
4258 const uint32_t count = (drawCount *
4259 instanceCount *
4260 (pipeline->use_primitive_replication ?
4261 1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4262 anv_measure_snapshot(cmd_buffer,
4263 INTEL_SNAPSHOT_DRAW,
4264 "draw_multi", count);
4265 trace_intel_begin_draw_multi(&cmd_buffer->trace, cmd_buffer);
4266
4267 genX(cmd_buffer_flush_state)(cmd_buffer);
4268
4269 if (cmd_buffer->state.conditional_render_enabled)
4270 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4271
4272 /* Our implementation of VK_KHR_multiview uses instancing to draw the
4273 * different views. We need to multiply instanceCount by the view count.
4274 */
4275 if (!pipeline->use_primitive_replication)
4276 instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4277
4278 uint32_t i = 0;
4279 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
4280 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4281 draw->firstVertex,
4282 firstInstance, i, !i);
4283
4284 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4285 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4286 prim.VertexAccessType = SEQUENTIAL;
4287 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4288 prim.VertexCountPerInstance = draw->vertexCount;
4289 prim.StartVertexLocation = draw->firstVertex;
4290 prim.InstanceCount = instanceCount;
4291 prim.StartInstanceLocation = firstInstance;
4292 prim.BaseVertexLocation = 0;
4293 }
4294 }
4295
4296 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4297
4298 trace_intel_end_draw_multi(&cmd_buffer->trace, cmd_buffer, count);
4299 }
4300
4301 void genX(CmdDrawIndexed)(
4302 VkCommandBuffer commandBuffer,
4303 uint32_t indexCount,
4304 uint32_t instanceCount,
4305 uint32_t firstIndex,
4306 int32_t vertexOffset,
4307 uint32_t firstInstance)
4308 {
4309 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4310 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4311 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4312
4313 if (anv_batch_has_error(&cmd_buffer->batch))
4314 return;
4315
4316 const uint32_t count = (indexCount *
4317 instanceCount *
4318 (pipeline->use_primitive_replication ?
4319 1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4320 anv_measure_snapshot(cmd_buffer,
4321 INTEL_SNAPSHOT_DRAW,
4322 "draw indexed",
4323 count);
4324 trace_intel_begin_draw_indexed(&cmd_buffer->trace, cmd_buffer);
4325
4326 genX(cmd_buffer_flush_state)(cmd_buffer);
4327
4328 if (cmd_buffer->state.conditional_render_enabled)
4329 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4330
4331 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
4332
4333 /* Our implementation of VK_KHR_multiview uses instancing to draw the
4334 * different views. We need to multiply instanceCount by the view count.
4335 */
4336 if (!pipeline->use_primitive_replication)
4337 instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4338
4339 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4340 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4341 prim.VertexAccessType = RANDOM;
4342 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4343 prim.VertexCountPerInstance = indexCount;
4344 prim.StartVertexLocation = firstIndex;
4345 prim.InstanceCount = instanceCount;
4346 prim.StartInstanceLocation = firstInstance;
4347 prim.BaseVertexLocation = vertexOffset;
4348 }
4349
4350 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4351
4352 trace_intel_end_draw_indexed(&cmd_buffer->trace, cmd_buffer, count);
4353 }
4354
4355 void genX(CmdDrawMultiIndexedEXT)(
4356 VkCommandBuffer commandBuffer,
4357 uint32_t drawCount,
4358 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
4359 uint32_t instanceCount,
4360 uint32_t firstInstance,
4361 uint32_t stride,
4362 const int32_t *pVertexOffset)
4363 {
4364 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4365 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4366 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4367
4368 if (anv_batch_has_error(&cmd_buffer->batch))
4369 return;
4370
4371 const uint32_t count = (drawCount *
4372 instanceCount *
4373 (pipeline->use_primitive_replication ?
4374 1 : anv_cmd_buffer_get_view_count(cmd_buffer)));
4375 anv_measure_snapshot(cmd_buffer,
4376 INTEL_SNAPSHOT_DRAW,
4377 "draw indexed_multi",
4378 count);
4379 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace, cmd_buffer);
4380
4381 genX(cmd_buffer_flush_state)(cmd_buffer);
4382
4383 if (cmd_buffer->state.conditional_render_enabled)
4384 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4385
4386 /* Our implementation of VK_KHR_multiview uses instancing to draw the
4387 * different views. We need to multiply instanceCount by the view count.
4388 */
4389 if (!pipeline->use_primitive_replication)
4390 instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4391
4392 uint32_t i = 0;
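   /* With a shared pVertexOffset, every draw uses the same
    * BaseVertexLocation, so the base vertex/instance buffer only needs to be
    * emitted once. The draw-id buffer (when used) still changes for every
    * draw.
    */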
4393 if (pVertexOffset) {
4394 if (vs_prog_data->uses_drawid) {
4395 bool emitted = true;
4396 if (vs_prog_data->uses_firstvertex ||
4397 vs_prog_data->uses_baseinstance) {
4398 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4399 emitted = true;
4400 }
4401 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4402 if (vs_prog_data->uses_drawid) {
4403 emit_draw_index(cmd_buffer, i);
4404 emitted = true;
4405 }
4406 /* Emitting draw index or vertex index BOs may result in needing
4407 * additional VF cache flushes.
4408 */
4409 if (emitted)
4410 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4411
4412 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4413 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4414 prim.VertexAccessType = RANDOM;
4415 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4416 prim.VertexCountPerInstance = draw->indexCount;
4417 prim.StartVertexLocation = draw->firstIndex;
4418 prim.InstanceCount = instanceCount;
4419 prim.StartInstanceLocation = firstInstance;
4420 prim.BaseVertexLocation = *pVertexOffset;
4421 }
4422 emitted = false;
4423 }
4424 } else {
4425 if (vs_prog_data->uses_firstvertex ||
4426 vs_prog_data->uses_baseinstance) {
4427 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4428 /* Emitting draw index or vertex index BOs may result in needing
4429 * additional VF cache flushes.
4430 */
4431 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4432 }
4433 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4434 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4435 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4436 prim.VertexAccessType = RANDOM;
4437 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4438 prim.VertexCountPerInstance = draw->indexCount;
4439 prim.StartVertexLocation = draw->firstIndex;
4440 prim.InstanceCount = instanceCount;
4441 prim.StartInstanceLocation = firstInstance;
4442 prim.BaseVertexLocation = *pVertexOffset;
4443 }
4444 }
4445 }
4446 } else {
4447 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4448 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4449 draw->vertexOffset,
4450 firstInstance, i, i != 0);
4451
4452 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4453 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4454 prim.VertexAccessType = RANDOM;
4455 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4456 prim.VertexCountPerInstance = draw->indexCount;
4457 prim.StartVertexLocation = draw->firstIndex;
4458 prim.InstanceCount = instanceCount;
4459 prim.StartInstanceLocation = firstInstance;
4460 prim.BaseVertexLocation = draw->vertexOffset;
4461 }
4462 }
4463 }
4464
4465 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4466
4467 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, cmd_buffer, count);
4468 }
4469
4470 /* Auto-Draw / Indirect Registers */
4471 #define GFX7_3DPRIM_END_OFFSET 0x2420
4472 #define GFX7_3DPRIM_START_VERTEX 0x2430
4473 #define GFX7_3DPRIM_VERTEX_COUNT 0x2434
4474 #define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
4475 #define GFX7_3DPRIM_START_INSTANCE 0x243C
4476 #define GFX7_3DPRIM_BASE_VERTEX 0x2440
4477
4478 void genX(CmdDrawIndirectByteCountEXT)(
4479 VkCommandBuffer commandBuffer,
4480 uint32_t instanceCount,
4481 uint32_t firstInstance,
4482 VkBuffer counterBuffer,
4483 VkDeviceSize counterBufferOffset,
4484 uint32_t counterOffset,
4485 uint32_t vertexStride)
4486 {
4487 #if GFX_VERx10 >= 75
4488 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4489 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
4490 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4491 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4492
4493 /* firstVertex is always zero for this draw function */
4494 const uint32_t firstVertex = 0;
4495
4496 if (anv_batch_has_error(&cmd_buffer->batch))
4497 return;
4498
4499 anv_measure_snapshot(cmd_buffer,
4500 INTEL_SNAPSHOT_DRAW,
4501 "draw indirect byte count",
4502 instanceCount);
4503 trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace, cmd_buffer);
4504
4505 genX(cmd_buffer_flush_state)(cmd_buffer);
4506
4507 if (cmd_buffer->state.conditional_render_enabled)
4508 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4509
4510 if (vs_prog_data->uses_firstvertex ||
4511 vs_prog_data->uses_baseinstance)
4512 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
4513 if (vs_prog_data->uses_drawid)
4514 emit_draw_index(cmd_buffer, 0);
4515
4516 /* Emitting draw index or vertex index BOs may result in needing
4517 * additional VF cache flushes.
4518 */
4519 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4520
4521 /* Our implementation of VK_KHR_multiview uses instancing to draw the
4522 * different views. We need to multiply instanceCount by the view count.
4523 */
4524 if (!pipeline->use_primitive_replication)
4525 instanceCount *= anv_cmd_buffer_get_view_count(cmd_buffer);
4526
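/* The counter buffer holds a byte count (typically written by a previous
 * CmdEndTransformFeedbackEXT).  Turn it into a vertex count by subtracting
 * counterOffset and dividing by vertexStride, then load the result into the
 * 3DPRIM_VERTEX_COUNT register so the 3DPRIMITIVE below can fetch its
 * parameters indirectly.
 */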
4527 struct mi_builder b;
4528 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4529 struct mi_value count =
4530 mi_mem32(anv_address_add(counter_buffer->address,
4531 counterBufferOffset));
4532 if (counterOffset)
4533 count = mi_isub(&b, count, mi_imm(counterOffset));
4534 count = mi_udiv32_imm(&b, count, vertexStride);
4535 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
4536
4537 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
4538 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));
4539 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
4540 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4541
4542 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4543 prim.IndirectParameterEnable = true;
4544 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4545 prim.VertexAccessType = SEQUENTIAL;
4546 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4547 }
4548
4549 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4550
4551 trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace, cmd_buffer,
4552 instanceCount);
4553 #endif /* GFX_VERx10 >= 75 */
4554 }
4555
4556 static void
4557 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
4558 struct anv_address addr,
4559 bool indexed)
4560 {
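/* The indirect buffer follows the Vulkan command layouts:
 *   VkDrawIndirectCommand:        { vertexCount, instanceCount, firstVertex, firstInstance }
 *   VkDrawIndexedIndirectCommand: { indexCount, instanceCount, firstIndex, vertexOffset, firstInstance }
 * which is where the dword offsets 0/4/8/12(/16) below come from.
 */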
4561 struct mi_builder b;
4562 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4563
4564 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
4565 mi_mem32(anv_address_add(addr, 0)));
4566
4567 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
4568 unsigned view_count = anv_cmd_buffer_get_view_count(cmd_buffer);
4569 if (view_count > 1) {
4570 #if GFX_VERx10 >= 75
4571 instance_count = mi_imul_imm(&b, instance_count, view_count);
4572 #else
4573 anv_finishme("Multiview + indirect draw requires MI_MATH; "
4574 "MI_MATH is not supported on Ivy Bridge");
4575 #endif
4576 }
4577 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
4578
4579 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
4580 mi_mem32(anv_address_add(addr, 8)));
4581
4582 if (indexed) {
4583 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
4584 mi_mem32(anv_address_add(addr, 12)));
4585 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4586 mi_mem32(anv_address_add(addr, 16)));
4587 } else {
4588 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4589 mi_mem32(anv_address_add(addr, 12)));
4590 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4591 }
4592 }
4593
4594 void genX(CmdDrawIndirect)(
4595 VkCommandBuffer commandBuffer,
4596 VkBuffer _buffer,
4597 VkDeviceSize offset,
4598 uint32_t drawCount,
4599 uint32_t stride)
4600 {
4601 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4602 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4603 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4604 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4605
4606 if (anv_batch_has_error(&cmd_buffer->batch))
4607 return;
4608
4609 anv_measure_snapshot(cmd_buffer,
4610 INTEL_SNAPSHOT_DRAW,
4611 "draw indirect",
4612 drawCount);
4613 trace_intel_begin_draw_indirect(&cmd_buffer->trace, cmd_buffer);
4614
4615 genX(cmd_buffer_flush_state)(cmd_buffer);
4616
4617 if (cmd_buffer->state.conditional_render_enabled)
4618 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4619
4620 for (uint32_t i = 0; i < drawCount; i++) {
4621 struct anv_address draw = anv_address_add(buffer->address, offset);
4622
4623 if (vs_prog_data->uses_firstvertex ||
4624 vs_prog_data->uses_baseinstance)
4625 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4626 if (vs_prog_data->uses_drawid)
4627 emit_draw_index(cmd_buffer, i);
4628
4629 /* Emitting draw index or vertex index BOs may result in needing
4630 * additional VF cache flushes.
4631 */
4632 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4633
4634 load_indirect_parameters(cmd_buffer, draw, false);
4635
4636 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4637 prim.IndirectParameterEnable = true;
4638 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4639 prim.VertexAccessType = SEQUENTIAL;
4640 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4641 }
4642
4643 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4644
4645 offset += stride;
4646 }
4647
4648 trace_intel_end_draw_indirect(&cmd_buffer->trace, cmd_buffer, drawCount);
4649 }
4650
4651 void genX(CmdDrawIndexedIndirect)(
4652 VkCommandBuffer commandBuffer,
4653 VkBuffer _buffer,
4654 VkDeviceSize offset,
4655 uint32_t drawCount,
4656 uint32_t stride)
4657 {
4658 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4659 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4660 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4661 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4662
4663 if (anv_batch_has_error(&cmd_buffer->batch))
4664 return;
4665
4666 anv_measure_snapshot(cmd_buffer,
4667 INTEL_SNAPSHOT_DRAW,
4668 "draw indexed indirect",
4669 drawCount);
4670 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace, cmd_buffer);
4671
4672 genX(cmd_buffer_flush_state)(cmd_buffer);
4673
4674 if (cmd_buffer->state.conditional_render_enabled)
4675 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4676
4677 for (uint32_t i = 0; i < drawCount; i++) {
4678 struct anv_address draw = anv_address_add(buffer->address, offset);
4679
4680 /* TODO: We need to stomp base vertex to 0 somehow */
4681 if (vs_prog_data->uses_firstvertex ||
4682 vs_prog_data->uses_baseinstance)
4683 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4684 if (vs_prog_data->uses_drawid)
4685 emit_draw_index(cmd_buffer, i);
4686
4687 /* Emitting draw index or vertex index BOs may result in needing
4688 * additional VF cache flushes.
4689 */
4690 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4691
4692 load_indirect_parameters(cmd_buffer, draw, true);
4693
4694 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4695 prim.IndirectParameterEnable = true;
4696 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4697 prim.VertexAccessType = RANDOM;
4698 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4699 }
4700
4701 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4702
4703 offset += stride;
4704 }
4705
4706 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, cmd_buffer, drawCount);
4707 }
4708
4709 static struct mi_value
4710 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4711 struct mi_builder *b,
4712 struct anv_buffer *count_buffer,
4713 uint64_t countBufferOffset)
4714 {
4715 struct anv_address count_address =
4716 anv_address_add(count_buffer->address, countBufferOffset);
4717
4718 struct mi_value ret = mi_imm(0);
4719
4720 if (cmd_buffer->state.conditional_render_enabled) {
4721 #if GFX_VERx10 >= 75
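/* With conditional rendering the draw count has to stay available while each
 * draw's predicate is computed, so keep it in a GPR; the callers combine it
 * with the conditional rendering result via emit_draw_count_predicate_cond().
 */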
4722 ret = mi_new_gpr(b);
4723 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4724 #endif
4725 } else {
4726 /* Upload the current draw count from the draw parameters buffer to
4727 * MI_PREDICATE_SRC0.
4728 */
4729 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4730 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4731 }
4732
4733 return ret;
4734 }
4735
4736 static void
4737 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4738 struct mi_builder *b,
4739 uint32_t draw_index)
4740 {
4741 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4742 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4743
4744 if (draw_index == 0) {
4745 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4746 mip.LoadOperation = LOAD_LOADINV;
4747 mip.CombineOperation = COMBINE_SET;
4748 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4749 }
4750 } else {
4751 /* While draw_index < draw_count the predicate's result will be
4752 * (draw_index == draw_count) ^ TRUE = (FALSE) ^ TRUE = TRUE
4753 * When draw_index == draw_count the result is
4754 * (TRUE) ^ TRUE = FALSE
4755 * After this all results will be:
4756 * (FALSE) ^ FALSE = FALSE
4757 */
4758 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4759 mip.LoadOperation = LOAD_LOAD;
4760 mip.CombineOperation = COMBINE_XOR;
4761 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4762 }
4763 }
4764 }
4765
4766 #if GFX_VERx10 >= 75
4767 static void
4768 emit_draw_count_predicate_with_conditional_render(
4769 struct anv_cmd_buffer *cmd_buffer,
4770 struct mi_builder *b,
4771 uint32_t draw_index,
4772 struct mi_value max)
4773 {
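/* Predicate the draw on (draw_index < draw_count) ANDed with the result of
 * the conditional rendering predicate stored in ANV_PREDICATE_RESULT_REG.
 */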
4774 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4775 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4776
4777 #if GFX_VER >= 8
4778 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4779 #else
4780 /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
4781 * so we emit MI_PREDICATE to set it.
4782 */
4783
4784 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4785 mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4786
4787 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4788 mip.LoadOperation = LOAD_LOADINV;
4789 mip.CombineOperation = COMBINE_SET;
4790 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4791 }
4792 #endif
4793 }
4794 #endif
4795
4796 static void
4797 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4798 struct mi_builder *b,
4799 uint32_t draw_index,
4800 struct mi_value max)
4801 {
4802 #if GFX_VERx10 >= 75
4803 if (cmd_buffer->state.conditional_render_enabled) {
4804 emit_draw_count_predicate_with_conditional_render(
4805 cmd_buffer, b, draw_index, mi_value_ref(b, max));
4806 } else {
4807 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4808 }
4809 #else
4810 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4811 #endif
4812 }
4813
4814 void genX(CmdDrawIndirectCount)(
4815 VkCommandBuffer commandBuffer,
4816 VkBuffer _buffer,
4817 VkDeviceSize offset,
4818 VkBuffer _countBuffer,
4819 VkDeviceSize countBufferOffset,
4820 uint32_t maxDrawCount,
4821 uint32_t stride)
4822 {
4823 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4824 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4825 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4826 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4827 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4828 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4829
4830 if (anv_batch_has_error(&cmd_buffer->batch))
4831 return;
4832
4833 anv_measure_snapshot(cmd_buffer,
4834 INTEL_SNAPSHOT_DRAW,
4835 "draw indirect count",
4836 0);
4837 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace, cmd_buffer);
4838
4839 genX(cmd_buffer_flush_state)(cmd_buffer);
4840
4841 struct mi_builder b;
4842 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4843 struct mi_value max =
4844 prepare_for_draw_count_predicate(cmd_buffer, &b,
4845 count_buffer, countBufferOffset);
4846
4847 for (uint32_t i = 0; i < maxDrawCount; i++) {
4848 struct anv_address draw = anv_address_add(buffer->address, offset);
4849
4850 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4851
4852 if (vs_prog_data->uses_firstvertex ||
4853 vs_prog_data->uses_baseinstance)
4854 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4855 if (vs_prog_data->uses_drawid)
4856 emit_draw_index(cmd_buffer, i);
4857
4858 /* Emitting draw index or vertex index BOs may result in needing
4859 * additional VF cache flushes.
4860 */
4861 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4862
4863 load_indirect_parameters(cmd_buffer, draw, false);
4864
4865 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4866 prim.IndirectParameterEnable = true;
4867 prim.PredicateEnable = true;
4868 prim.VertexAccessType = SEQUENTIAL;
4869 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4870 }
4871
4872 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4873
4874 offset += stride;
4875 }
4876
4877 mi_value_unref(&b, max);
4878
4879 trace_intel_end_draw_indirect_count(&cmd_buffer->trace, cmd_buffer, maxDrawCount);
4880 }
4881
4882 void genX(CmdDrawIndexedIndirectCount)(
4883 VkCommandBuffer commandBuffer,
4884 VkBuffer _buffer,
4885 VkDeviceSize offset,
4886 VkBuffer _countBuffer,
4887 VkDeviceSize countBufferOffset,
4888 uint32_t maxDrawCount,
4889 uint32_t stride)
4890 {
4891 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4892 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4893 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4894 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4895 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4896 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4897
4898 if (anv_batch_has_error(&cmd_buffer->batch))
4899 return;
4900
4901 anv_measure_snapshot(cmd_buffer,
4902 INTEL_SNAPSHOT_DRAW,
4903 "draw indexed indirect count",
4904 0);
4905 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace, cmd_buffer);
4906
4907 genX(cmd_buffer_flush_state)(cmd_buffer);
4908
4909 struct mi_builder b;
4910 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4911 struct mi_value max =
4912 prepare_for_draw_count_predicate(cmd_buffer, &b,
4913 count_buffer, countBufferOffset);
4914
4915 for (uint32_t i = 0; i < maxDrawCount; i++) {
4916 struct anv_address draw = anv_address_add(buffer->address, offset);
4917
4918 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4919
4920 /* TODO: We need to stomp base vertex to 0 somehow */
4921 if (vs_prog_data->uses_firstvertex ||
4922 vs_prog_data->uses_baseinstance)
4923 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4924 if (vs_prog_data->uses_drawid)
4925 emit_draw_index(cmd_buffer, i);
4926
4927 /* Emitting draw index or vertex index BOs may result in needing
4928 * additional VF cache flushes.
4929 */
4930 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4931
4932 load_indirect_parameters(cmd_buffer, draw, true);
4933
4934 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4935 prim.IndirectParameterEnable = true;
4936 prim.PredicateEnable = true;
4937 prim.VertexAccessType = RANDOM;
4938 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4939 }
4940
4941 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4942
4943 offset += stride;
4944 }
4945
4946 mi_value_unref(&b, max);
4947
4948 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
4949 cmd_buffer, maxDrawCount);
4950
4951 }
4952
4953 void genX(CmdBeginTransformFeedbackEXT)(
4954 VkCommandBuffer commandBuffer,
4955 uint32_t firstCounterBuffer,
4956 uint32_t counterBufferCount,
4957 const VkBuffer* pCounterBuffers,
4958 const VkDeviceSize* pCounterBufferOffsets)
4959 {
4960 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4961
4962 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4963 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4964 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4965
4966 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4967 *
4968 * "Ssoftware must ensure that no HW stream output operations can be in
4969 * process or otherwise pending at the point that the MI_LOAD/STORE
4970 * commands are processed. This will likely require a pipeline flush."
4971 */
4972 anv_add_pending_pipe_bits(cmd_buffer,
4973 ANV_PIPE_CS_STALL_BIT,
4974 "begin transform feedback");
4975 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4976
4977 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4978 /* If we have a counter buffer, this is a resume so we need to load the
4979 * value into the streamout offset register. Otherwise, this is a begin
4980 * and we need to reset it to zero.
4981 */
4982 if (pCounterBuffers &&
4983 idx >= firstCounterBuffer &&
4984 idx - firstCounterBuffer < counterBufferCount &&
4985 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4986 uint32_t cb_idx = idx - firstCounterBuffer;
4987 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4988 uint64_t offset = pCounterBufferOffsets ?
4989 pCounterBufferOffsets[cb_idx] : 0;
4990
4991 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4992 lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4993 lrm.MemoryAddress = anv_address_add(counter_buffer->address,
4994 offset);
4995 }
4996 } else {
4997 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4998 lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4999 lri.DataDWord = 0;
5000 }
5001 }
5002 }
5003
5004 cmd_buffer->state.xfb_enabled = true;
5005 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
5006 }
5007
5008 void genX(CmdEndTransformFeedbackEXT)(
5009 VkCommandBuffer commandBuffer,
5010 uint32_t firstCounterBuffer,
5011 uint32_t counterBufferCount,
5012 const VkBuffer* pCounterBuffers,
5013 const VkDeviceSize* pCounterBufferOffsets)
5014 {
5015 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5016
5017 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
5018 assert(counterBufferCount <= MAX_XFB_BUFFERS);
5019 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
5020
5021 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
5022 *
5023 * "Ssoftware must ensure that no HW stream output operations can be in
5024 * process or otherwise pending at the point that the MI_LOAD/STORE
5025 * commands are processed. This will likely require a pipeline flush."
5026 */
5027 anv_add_pending_pipe_bits(cmd_buffer,
5028 ANV_PIPE_CS_STALL_BIT,
5029 "end transform feedback");
5030 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5031
5032 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
5033 unsigned idx = firstCounterBuffer + cb_idx;
5034
5035 /* If we have a counter buffer, store the current streamout write offset
5036 * into it so that a later resume (a begin with this counter buffer) can
5037 * pick up where this transform feedback session left off.
5038 */
5039 if (pCounterBuffers &&
5040 cb_idx < counterBufferCount &&
5041 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
5042 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
5043 uint64_t offset = pCounterBufferOffsets ?
5044 pCounterBufferOffsets[cb_idx] : 0;
5045
5046 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
5047 srm.MemoryAddress = anv_address_add(counter_buffer->address,
5048 offset);
5049 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
5050 }
5051 }
5052 }
5053
5054 cmd_buffer->state.xfb_enabled = false;
5055 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
5056 }
5057
5058 #if GFX_VERx10 >= 125
5059 void
5060 genX(CmdDrawMeshTasksNV)(
5061 VkCommandBuffer commandBuffer,
5062 uint32_t taskCount,
5063 uint32_t firstTask)
5064 {
5065 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5066
5067 if (anv_batch_has_error(&cmd_buffer->batch))
5068 return;
5069
5070 /* TODO(mesh): Check if this is not emitting more packets than we need. */
5071 genX(cmd_buffer_flush_state)(cmd_buffer);
5072
5073 if (cmd_buffer->state.conditional_render_enabled)
5074 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5075
5076 /* BSpec 54016 says: "The values passed for Starting ThreadGroup ID X
5077 * and ThreadGroup Count X shall not cause TGIDs to exceed (2^32)-1."
5078 */
5079 assert((int64_t)firstTask + taskCount - 1 <= UINT32_MAX);
5080
5081 anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_1D), m) {
5082 m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
5083 m.ThreadGroupCountX = taskCount;
5084 m.StartingThreadGroupIDX = firstTask;
5085 }
5086 }
5087
5088 #define GFX125_3DMESH_TG_COUNT 0x26F0
5089 #define GFX125_3DMESH_STARTING_TGID 0x26F4
5090 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
5091
5092 static void
5093 mesh_load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
5094 struct mi_builder *b,
5095 struct anv_address addr,
5096 bool emit_xp0,
5097 uint32_t xp0)
5098 {
5099 const size_t taskCountOff = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
5100 const size_t firstTaskOff = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
5101
5102 mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
5103 mi_mem32(anv_address_add(addr, taskCountOff)));
5104
5105 mi_store(b, mi_reg32(GFX125_3DMESH_STARTING_TGID),
5106 mi_mem32(anv_address_add(addr, firstTaskOff)));
5107
5108 if (emit_xp0)
5109 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
5110 }
5111
5112 static void
5113 emit_indirect_3dmesh_1d(struct anv_batch *batch,
5114 bool predicate_enable,
5115 bool uses_drawid)
5116 {
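/* 3DMESH_1D grows by one dword when Extended Parameter 0 is present; that
 * parameter is used here to pass the draw index to the shader.
 */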
5117 uint32_t len = GENX(3DMESH_1D_length) + uses_drawid;
5118 anv_batch_emitn(batch, len, GENX(3DMESH_1D),
5119 .PredicateEnable = predicate_enable,
5120 .IndirectParameterEnable = true,
5121 .ExtendedParameter0Present = uses_drawid);
5122 }
5123
5124 void
5125 genX(CmdDrawMeshTasksIndirectNV)(
5126 VkCommandBuffer commandBuffer,
5127 VkBuffer _buffer,
5128 VkDeviceSize offset,
5129 uint32_t drawCount,
5130 uint32_t stride)
5131 {
5132 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5133 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5134 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5135 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5136 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5137 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5138
5139 if (anv_batch_has_error(&cmd_buffer->batch))
5140 return;
5141
5142 genX(cmd_buffer_flush_state)(cmd_buffer);
5143
5144 if (cmd_state->conditional_render_enabled)
5145 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5146
5147 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5148 mesh_prog_data->uses_drawid;
5149 struct mi_builder b;
5150 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5151
5152 for (uint32_t i = 0; i < drawCount; i++) {
5153 struct anv_address draw = anv_address_add(buffer->address, offset);
5154
5155 mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5156
5157 emit_indirect_3dmesh_1d(&cmd_buffer->batch,
5158 cmd_state->conditional_render_enabled, uses_drawid);
5159
5160 offset += stride;
5161 }
5162 }
5163
5164 void
5165 genX(CmdDrawMeshTasksIndirectCountNV)(
5166 VkCommandBuffer commandBuffer,
5167 VkBuffer _buffer,
5168 VkDeviceSize offset,
5169 VkBuffer _countBuffer,
5170 VkDeviceSize countBufferOffset,
5171 uint32_t maxDrawCount,
5172 uint32_t stride)
5173 {
5174 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5175 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5176 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
5177 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5178 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5179 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5180
5181 if (anv_batch_has_error(&cmd_buffer->batch))
5182 return;
5183
5184 genX(cmd_buffer_flush_state)(cmd_buffer);
5185
5186 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5187 mesh_prog_data->uses_drawid;
5188
5189 struct mi_builder b;
5190 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5191
5192 struct mi_value max =
5193 prepare_for_draw_count_predicate(cmd_buffer, &b,
5194 count_buffer, countBufferOffset);
5195
5196 for (uint32_t i = 0; i < maxDrawCount; i++) {
5197 struct anv_address draw = anv_address_add(buffer->address, offset);
5198
5199 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
5200
5201 mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5202
5203 emit_indirect_3dmesh_1d(&cmd_buffer->batch, true, uses_drawid);
5204
5205 offset += stride;
5206 }
5207 }
5208 #endif /* GFX_VERx10 >= 125 */
5209
5210 void
5211 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
5212 {
5213 struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5214 struct anv_compute_pipeline *pipeline = comp_state->pipeline;
5215
5216 assert(pipeline->cs);
5217
5218 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5219
5220 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5221
5222 /* Apply any pending pipeline flushes we may have. We want to apply them
5223 * now because, if any of those flushes are for things like push constants,
5224 * the GPU will read the state at weird times.
5225 */
5226 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5227
5228 if (cmd_buffer->state.compute.pipeline_dirty) {
5229 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
5230 *
5231 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
5232 * the only bits that are changed are scoreboard related: Scoreboard
5233 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
5234 * these scoreboard related states, a MEDIA_STATE_FLUSH is
5235 * sufficient."
5236 */
5237 anv_add_pending_pipe_bits(cmd_buffer,
5238 ANV_PIPE_CS_STALL_BIT,
5239 "flush compute state");
5240 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5241
5242 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
5243
5244 /* The workgroup size of the pipeline affects our push constant layout
5245 * so flag push constants as dirty if we change the pipeline.
5246 */
5247 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5248 }
5249
5250 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
5251 cmd_buffer->state.compute.pipeline_dirty) {
5252 flush_descriptor_sets(cmd_buffer,
5253 &cmd_buffer->state.compute.base,
5254 VK_SHADER_STAGE_COMPUTE_BIT,
5255 &pipeline->cs, 1);
5256 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5257
5258 #if GFX_VERx10 < 125
5259 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
5260 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
5261 .BindingTablePointer =
5262 cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5263 .SamplerStatePointer =
5264 cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5265 };
5266 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
5267
5268 struct anv_state state =
5269 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
5270 pipeline->interface_descriptor_data,
5271 GENX(INTERFACE_DESCRIPTOR_DATA_length),
5272 64);
5273
5274 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
5275 anv_batch_emit(&cmd_buffer->batch,
5276 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
5277 mid.InterfaceDescriptorTotalLength = size;
5278 mid.InterfaceDescriptorDataStartAddress = state.offset;
5279 }
5280 #endif
5281 }
5282
5283 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
5284 comp_state->push_data =
5285 anv_cmd_buffer_cs_push_constants(cmd_buffer);
5286
5287 #if GFX_VERx10 < 125
5288 if (comp_state->push_data.alloc_size) {
5289 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
5290 curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
5291 curbe.CURBEDataStartAddress = comp_state->push_data.offset;
5292 }
5293 }
5294 #endif
5295
5296 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5297 }
5298
5299 cmd_buffer->state.compute.pipeline_dirty = false;
5300
5301 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5302 }
5303
5304 #if GFX_VER == 7
5305
5306 static VkResult
5307 verify_cmd_parser(const struct anv_device *device,
5308 int required_version,
5309 const char *function)
5310 {
5311 if (device->physical->cmd_parser_version < required_version) {
5312 return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
5313 "cmd parser version %d is required for %s",
5314 required_version, function);
5315 } else {
5316 return VK_SUCCESS;
5317 }
5318 }
5319
5320 #endif
5321
5322 static void
5323 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
5324 uint32_t baseGroupX,
5325 uint32_t baseGroupY,
5326 uint32_t baseGroupZ)
5327 {
5328 if (anv_batch_has_error(&cmd_buffer->batch))
5329 return;
5330
5331 struct anv_push_constants *push =
5332 &cmd_buffer->state.compute.base.push_constants;
5333 if (push->cs.base_work_group_id[0] != baseGroupX ||
5334 push->cs.base_work_group_id[1] != baseGroupY ||
5335 push->cs.base_work_group_id[2] != baseGroupZ) {
5336 push->cs.base_work_group_id[0] = baseGroupX;
5337 push->cs.base_work_group_id[1] = baseGroupY;
5338 push->cs.base_work_group_id[2] = baseGroupZ;
5339
5340 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5341 }
5342 }
5343
5344 void genX(CmdDispatch)(
5345 VkCommandBuffer commandBuffer,
5346 uint32_t x,
5347 uint32_t y,
5348 uint32_t z)
5349 {
5350 genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
5351 }
5352
5353 #if GFX_VERx10 >= 125
5354
5355 static inline void
5356 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
5357 const struct anv_compute_pipeline *pipeline, bool indirect,
5358 const struct brw_cs_prog_data *prog_data,
5359 uint32_t groupCountX, uint32_t groupCountY,
5360 uint32_t groupCountZ)
5361 {
5362 struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5363 const struct anv_shader_bin *cs_bin = pipeline->cs;
5364 bool predicate = cmd_buffer->state.conditional_render_enabled;
5365
5366 const struct intel_device_info *devinfo = &pipeline->base.device->info;
5367 const struct brw_cs_dispatch_info dispatch =
5368 brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5369
5370 anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5371 cw.IndirectParameterEnable = indirect;
5372 cw.PredicateEnable = predicate;
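/* dispatch.simd_size is 8, 16 or 32 and the SIMDSize field encodes those as
 * 0, 1 and 2, hence the divide by 16.
 */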
5373 cw.SIMDSize = dispatch.simd_size / 16;
5374 cw.IndirectDataStartAddress = comp_state->push_data.offset;
5375 cw.IndirectDataLength = comp_state->push_data.alloc_size;
5376 cw.LocalXMaximum = prog_data->local_size[0] - 1;
5377 cw.LocalYMaximum = prog_data->local_size[1] - 1;
5378 cw.LocalZMaximum = prog_data->local_size[2] - 1;
5379 cw.ThreadGroupIDXDimension = groupCountX;
5380 cw.ThreadGroupIDYDimension = groupCountY;
5381 cw.ThreadGroupIDZDimension = groupCountZ;
5382 cw.ExecutionMask = dispatch.right_mask;
5383 cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
5384
5385 cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5386 .KernelStartPointer = cs_bin->kernel.offset,
5387 .SamplerStatePointer =
5388 cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5389 .BindingTablePointer =
5390 cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5391 .BindingTableEntryCount =
5392 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
5393 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
5394 .SharedLocalMemorySize = encode_slm_size(GFX_VER,
5395 prog_data->base.total_shared),
5396 .NumberOfBarriers = prog_data->uses_barrier,
5397 };
5398 }
5399 }
5400
5401 #else /* #if GFX_VERx10 >= 125 */
5402
5403 static inline void
5404 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
5405 const struct anv_compute_pipeline *pipeline, bool indirect,
5406 const struct brw_cs_prog_data *prog_data,
5407 uint32_t groupCountX, uint32_t groupCountY,
5408 uint32_t groupCountZ)
5409 {
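/* Indirect dispatches on gfx7 also rely on the MI_PREDICATE programmed in
 * CmdDispatchIndirect() to skip dispatches with a zero dimension, so enable
 * predication for them in addition to conditional rendering.
 */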
5410 bool predicate = (GFX_VER <= 7 && indirect) ||
5411 cmd_buffer->state.conditional_render_enabled;
5412
5413 const struct intel_device_info *devinfo = &pipeline->base.device->info;
5414 const struct brw_cs_dispatch_info dispatch =
5415 brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5416
5417 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
5418 ggw.IndirectParameterEnable = indirect;
5419 ggw.PredicateEnable = predicate;
5420 ggw.SIMDSize = dispatch.simd_size / 16;
5421 ggw.ThreadDepthCounterMaximum = 0;
5422 ggw.ThreadHeightCounterMaximum = 0;
5423 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
5424 ggw.ThreadGroupIDXDimension = groupCountX;
5425 ggw.ThreadGroupIDYDimension = groupCountY;
5426 ggw.ThreadGroupIDZDimension = groupCountZ;
5427 ggw.RightExecutionMask = dispatch.right_mask;
5428 ggw.BottomExecutionMask = 0xffffffff;
5429 }
5430
5431 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
5432 }
5433
5434 #endif /* #if GFX_VERx10 >= 125 */
5435
5436 static inline void
5437 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
5438 const struct anv_compute_pipeline *pipeline, bool indirect,
5439 const struct brw_cs_prog_data *prog_data,
5440 uint32_t groupCountX, uint32_t groupCountY,
5441 uint32_t groupCountZ)
5442 {
5443 #if GFX_VERx10 >= 125
5444 emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5445 groupCountY, groupCountZ);
5446 #else
5447 emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5448 groupCountY, groupCountZ);
5449 #endif
5450 }
5451
5452 void genX(CmdDispatchBase)(
5453 VkCommandBuffer commandBuffer,
5454 uint32_t baseGroupX,
5455 uint32_t baseGroupY,
5456 uint32_t baseGroupZ,
5457 uint32_t groupCountX,
5458 uint32_t groupCountY,
5459 uint32_t groupCountZ)
5460 {
5461 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5462 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5463 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5464
5465 anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
5466 baseGroupY, baseGroupZ);
5467
5468 if (anv_batch_has_error(&cmd_buffer->batch))
5469 return;
5470
5471 anv_measure_snapshot(cmd_buffer,
5472 INTEL_SNAPSHOT_COMPUTE,
5473 "compute",
5474 groupCountX * groupCountY * groupCountZ *
5475 prog_data->local_size[0] * prog_data->local_size[1] *
5476 prog_data->local_size[2]);
5477
5478 trace_intel_begin_compute(&cmd_buffer->trace, cmd_buffer);
5479
5480 if (prog_data->uses_num_work_groups) {
5481 struct anv_state state =
5482 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
5483 uint32_t *sizes = state.map;
5484 sizes[0] = groupCountX;
5485 sizes[1] = groupCountY;
5486 sizes[2] = groupCountZ;
5487 cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
5488 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5489 .offset = state.offset,
5490 };
5491
5492 /* The num_workgroups buffer goes in the binding table */
5493 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5494 }
5495
5496 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5497
5498 if (cmd_buffer->state.conditional_render_enabled)
5499 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5500
5501 emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
5502 groupCountY, groupCountZ);
5503
5504 trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer,
5505 groupCountX, groupCountY, groupCountZ);
5506 }
5507
5508 #define GPGPU_DISPATCHDIMX 0x2500
5509 #define GPGPU_DISPATCHDIMY 0x2504
5510 #define GPGPU_DISPATCHDIMZ 0x2508
5511
5512 void genX(CmdDispatchIndirect)(
5513 VkCommandBuffer commandBuffer,
5514 VkBuffer _buffer,
5515 VkDeviceSize offset)
5516 {
5517 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5518 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5519 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5520 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5521 struct anv_address addr = anv_address_add(buffer->address, offset);
5522 UNUSED struct anv_batch *batch = &cmd_buffer->batch;
5523
5524 anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
5525
5526 #if GFX_VER == 7
5527 /* Linux 4.4 added command parser version 5 which allows the GPGPU
5528 * indirect dispatch registers to be written.
5529 */
5530 if (verify_cmd_parser(cmd_buffer->device, 5,
5531 "vkCmdDispatchIndirect") != VK_SUCCESS)
5532 return;
5533 #endif
5534
5535 anv_measure_snapshot(cmd_buffer,
5536 INTEL_SNAPSHOT_COMPUTE,
5537 "compute indirect",
5538 0);
5539 trace_intel_begin_compute(&cmd_buffer->trace, cmd_buffer);
5540
5541 if (prog_data->uses_num_work_groups) {
5542 cmd_buffer->state.compute.num_workgroups = addr;
5543
5544 /* The num_workgroups buffer goes in the binding table */
5545 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5546 }
5547
5548 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5549
5550 struct mi_builder b;
5551 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5552
5553 struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
5554 struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
5555 struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
5556
5557 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
5558 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
5559 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
5560
5561 #if GFX_VER <= 7
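/* Build a predicate that ends up TRUE only when all three dispatch
 * dimensions are non-zero, so that the predicated GPGPU_WALKER emitted from
 * emit_cs_walker() drops an empty indirect dispatch.
 */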
5562 /* predicate = (compute_dispatch_indirect_x_size == 0); */
5563 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
5564 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5565 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5566 mip.LoadOperation = LOAD_LOAD;
5567 mip.CombineOperation = COMBINE_SET;
5568 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5569 }
5570
5571 /* predicate |= (compute_dispatch_indirect_y_size == 0); */
5572 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
5573 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5574 mip.LoadOperation = LOAD_LOAD;
5575 mip.CombineOperation = COMBINE_OR;
5576 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5577 }
5578
5579 /* predicate |= (compute_dispatch_indirect_z_size == 0); */
5580 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
5581 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5582 mip.LoadOperation = LOAD_LOAD;
5583 mip.CombineOperation = COMBINE_OR;
5584 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5585 }
5586
5587 /* predicate = !predicate; */
5588 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5589 mip.LoadOperation = LOAD_LOADINV;
5590 mip.CombineOperation = COMBINE_OR;
5591 mip.CompareOperation = COMPARE_FALSE;
5592 }
5593
5594 #if GFX_VERx10 == 75
5595 if (cmd_buffer->state.conditional_render_enabled) {
5596 /* predicate &= !(conditional_rendering_predicate == 0); */
5597 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
5598 mi_reg32(ANV_PREDICATE_RESULT_REG));
5599 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5600 mip.LoadOperation = LOAD_LOADINV;
5601 mip.CombineOperation = COMBINE_AND;
5602 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5603 }
5604 }
5605 #endif
5606
5607 #else /* GFX_VER > 7 */
5608 if (cmd_buffer->state.conditional_render_enabled)
5609 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5610 #endif
5611
5612 emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
5613
5614 trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer, 0, 0, 0);
5615 }
5616
5617 struct anv_state
5618 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
5619 {
5620 #if GFX_VERx10 >= 125
5621 struct anv_device *device = cmd_buffer->device;
5622
5623 struct anv_state state =
5624 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5625 BRW_RT_DISPATCH_GLOBALS_SIZE,
5626 64);
5627 struct brw_rt_scratch_layout layout;
5628 uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
5629 * some cases?
5630 */
5631 brw_rt_compute_scratch_layout(&layout, &device->info,
5632 stack_ids_per_dss, 1 << 10);
5633
5634 struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5635 .MemBaseAddress = (struct anv_address) {
5636 /* The ray query HW computes offsets from the top of the buffer, so
5637 * point the address at the end of the buffer.
5638 */
5639 .bo = device->ray_query_bo,
5640 .offset = device->ray_query_bo->size
5641 },
5642 .AsyncRTStackSize = layout.ray_stack_stride / 64,
5643 .NumDSSRTStacks = layout.stack_ids_per_dss,
5644 .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5645 .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5646 .ResumeShaderTable = (struct anv_address) {
5647 .bo = cmd_buffer->state.ray_query_shadow_bo,
5648 },
5649 };
5650 GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg);
5651
5652 return state;
5653 #else
5654 unreachable("Not supported");
5655 #endif
5656 }
5657
5658 #if GFX_VERx10 >= 125
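/* Split a fixed 2^3 = 8 invocation local workgroup across X/Y/Z for a trace
 * rays dispatch: hand out the three shift bits round-robin to dimensions
 * whose local size is still smaller than their global size, and give any
 * leftover bits to X.
 */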
5659 static void
5660 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
5661 {
5662 unsigned total_shift = 0;
5663 memset(local_shift, 0, 3);
5664
5665 bool progress;
5666 do {
5667 progress = false;
5668 for (unsigned i = 0; i < 3; i++) {
5669 assert(global[i] > 0);
5670 if ((1 << local_shift[i]) < global[i]) {
5671 progress = true;
5672 local_shift[i]++;
5673 total_shift++;
5674 }
5675
5676 if (total_shift == 3)
5677 return;
5678 }
5679 } while(progress);
5680
5681 /* Assign whatever's left to x */
5682 local_shift[0] += 3 - total_shift;
5683 }
5684
5685 static struct GFX_RT_SHADER_TABLE
5686 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
5687 {
5688 return (struct GFX_RT_SHADER_TABLE) {
5689 .BaseAddress = anv_address_from_u64(region->deviceAddress),
5690 .Stride = region->stride,
5691 };
5692 }
5693
5694 static void
5695 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
5696 const VkStridedDeviceAddressRegionKHR *raygen_sbt,
5697 const VkStridedDeviceAddressRegionKHR *miss_sbt,
5698 const VkStridedDeviceAddressRegionKHR *hit_sbt,
5699 const VkStridedDeviceAddressRegionKHR *callable_sbt,
5700 bool is_indirect,
5701 uint32_t launch_width,
5702 uint32_t launch_height,
5703 uint32_t launch_depth,
5704 uint64_t launch_size_addr)
5705 {
5706 struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
5707 struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
5708
5709 if (anv_batch_has_error(&cmd_buffer->batch))
5710 return;
5711
5712 /* If we have a known degenerate launch size, just bail */
5713 if (!is_indirect &&
5714 (launch_width == 0 || launch_height == 0 || launch_depth == 0))
5715 return;
5716
5717 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5718 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5719
5720 cmd_buffer->state.rt.pipeline_dirty = false;
5721
5722 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5723
5724 /* Add these to the reloc list as they're internal buffers that don't
5725 * actually have relocs to pick them up manually.
5726 *
5727 * TODO(RT): This is a bit of a hack
5728 */
5729 anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
5730 cmd_buffer->batch.alloc,
5731 rt->scratch.bo);
5732
5733 /* Allocate and set up our RT_DISPATCH_GLOBALS */
5734 struct anv_state rtdg_state =
5735 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5736 BRW_RT_PUSH_CONST_OFFSET +
5737 sizeof(struct anv_push_constants),
5738 64);
5739
5740 struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5741 .MemBaseAddress = (struct anv_address) {
5742 .bo = rt->scratch.bo,
5743 .offset = rt->scratch.layout.ray_stack_start,
5744 },
5745 .CallStackHandler =
5746 anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
5747 .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
5748 .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
5749 .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5750 .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5751 .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
5752 .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
5753 .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
5754 .LaunchWidth = launch_width,
5755 .LaunchHeight = launch_height,
5756 .LaunchDepth = launch_depth,
5757 .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
5758 };
5759 GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
5760
5761 /* Push constants go after the RT_DISPATCH_GLOBALS */
5762 assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
5763 memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
5764 &cmd_buffer->state.rt.base.push_constants,
5765 sizeof(struct anv_push_constants));
5766
5767 struct anv_address rtdg_addr = {
5768 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5769 .offset = rtdg_state.offset,
5770 };
5771
5772 uint8_t local_size_log2[3];
5773 uint32_t global_size[3] = {};
5774 if (is_indirect) {
5775 /* Pick a local size that's probably ok. We assume most TraceRays calls
5776 * will use a two-dimensional dispatch size. Worst case, our initial
5777 * dispatch will be a little slower than it has to be.
5778 */
5779 local_size_log2[0] = 2;
5780 local_size_log2[1] = 1;
5781 local_size_log2[2] = 0;
5782
5783 struct mi_builder b;
5784 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5785
5786 struct mi_value launch_size[3] = {
5787 mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
5788 mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
5789 mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
5790 };
5791
5792 /* Store the original launch size into RT_DISPATCH_GLOBALS
5793 *
5794 * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
5795 * moved into a genX version.
5796 */
5797 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
5798 mi_value_ref(&b, launch_size[0]));
5799 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
5800 mi_value_ref(&b, launch_size[1]));
5801 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
5802 mi_value_ref(&b, launch_size[2]));
5803
5804 /* Compute the global dispatch size */
5805 for (unsigned i = 0; i < 3; i++) {
5806 if (local_size_log2[i] == 0)
5807 continue;
5808
5809 /* global_size = DIV_ROUND_UP(launch_size, local_size)
5810 *
5811 * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has the
5812 * semantics of shifting the entire 64-bit value and taking the bottom
5813 * 32 bits, so we don't have to worry about roll-over.
5814 */
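/* e.g. a launch_size of 100 with a local_size of 8 (shift of 3) yields
 * (100 + 7) >> 3 = 13 groups.
 */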
5815 uint32_t local_size = 1 << local_size_log2[i];
5816 launch_size[i] = mi_iadd(&b, launch_size[i],
5817 mi_imm(local_size - 1));
5818 launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
5819 local_size_log2[i]);
5820 }
5821
5822 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
5823 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
5824 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
5825 } else {
5826 uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
5827 calc_local_trace_size(local_size_log2, launch_size);
5828
5829 for (unsigned i = 0; i < 3; i++) {
5830 /* We have to be a bit careful here because DIV_ROUND_UP adds to the
5831 * numerator, which may overflow. Cast to uint64_t to avoid this.
5832 */
5833 uint32_t local_size = 1 << local_size_log2[i];
5834 global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
5835 }
5836 }
5837
5838 anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5839 cw.IndirectParameterEnable = is_indirect;
5840 cw.PredicateEnable = false;
5841 cw.SIMDSize = SIMD8;
5842 cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
5843 cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
5844 cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
5845 cw.ThreadGroupIDXDimension = global_size[0];
5846 cw.ThreadGroupIDYDimension = global_size[1];
5847 cw.ThreadGroupIDZDimension = global_size[2];
5848 cw.ExecutionMask = 0xff;
5849 cw.EmitInlineParameter = true;
5850 cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
5851
5852 const gl_shader_stage s = MESA_SHADER_RAYGEN;
5853 struct anv_device *device = cmd_buffer->device;
5854 struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
5855 struct anv_state *samplers = &cmd_buffer->state.samplers[s];
5856 cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5857 .KernelStartPointer = device->rt_trampoline->kernel.offset,
5858 .SamplerStatePointer = samplers->offset,
5859 /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
5860 .SamplerCount = 0,
5861 .BindingTablePointer = surfaces->offset,
5862 .NumberofThreadsinGPGPUThreadGroup = 1,
5863 .BTDMode = true,
5864 };
5865
5866 struct brw_rt_raygen_trampoline_params trampoline_params = {
5867 .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
5868 .raygen_bsr_addr = raygen_sbt->deviceAddress,
5869 .is_indirect = is_indirect,
5870 .local_group_size_log2 = {
5871 local_size_log2[0],
5872 local_size_log2[1],
5873 local_size_log2[2],
5874 },
5875 };
5876 STATIC_ASSERT(sizeof(trampoline_params) == 32);
5877 memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
5878 }
5879 }
5880
5881 void
5882 genX(CmdTraceRaysKHR)(
5883 VkCommandBuffer commandBuffer,
5884 const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
5885 const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
5886 const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
5887 const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
5888 uint32_t width,
5889 uint32_t height,
5890 uint32_t depth)
5891 {
5892 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5893
5894 cmd_buffer_trace_rays(cmd_buffer,
5895 pRaygenShaderBindingTable,
5896 pMissShaderBindingTable,
5897 pHitShaderBindingTable,
5898 pCallableShaderBindingTable,
5899 false /* is_indirect */,
5900 width, height, depth,
5901 0 /* launch_size_addr */);
5902 }
5903
5904 void
5905 genX(CmdTraceRaysIndirectKHR)(
5906 VkCommandBuffer commandBuffer,
5907 const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
5908 const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
5909 const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
5910 const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
5911 VkDeviceAddress indirectDeviceAddress)
5912 {
5913 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5914
5915 cmd_buffer_trace_rays(cmd_buffer,
5916 pRaygenShaderBindingTable,
5917 pMissShaderBindingTable,
5918 pHitShaderBindingTable,
5919 pCallableShaderBindingTable,
5920 true /* is_indirect */,
5921 0, 0, 0, /* width, height, depth, */
5922 indirectDeviceAddress);
5923 }
5924 #endif /* GFX_VERx10 >= 125 */
5925
5926 static void
5927 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
5928 uint32_t pipeline)
5929 {
5930 UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
5931
5932 if (cmd_buffer->state.current_pipeline == pipeline)
5933 return;
5934
5935 #if GFX_VER >= 8 && GFX_VER < 10
5936 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
5937 *
5938 * Software must clear the COLOR_CALC_STATE Valid field in
5939 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
5940 * with Pipeline Select set to GPGPU.
5941 *
5942 * The internal hardware docs recommend the same workaround for Gfx9
5943 * hardware too.
5944 */
5945 if (pipeline == GPGPU)
5946 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
5947 #endif
5948
5949 #if GFX_VER == 9
5950 if (pipeline == _3D) {
5951 /* There is a mid-object preemption workaround which requires you to
5952 * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However,
5953 * even without preemption, we have issues with geometry flickering when
5954 * GPGPU and 3D are back-to-back and this seems to fix it. We don't
5955 * really know why.
5956 */
5957 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
5958 vfe.MaximumNumberofThreads =
5959 devinfo->max_cs_threads * devinfo->subslice_total - 1;
5960 vfe.NumberofURBEntries = 2;
5961 vfe.URBEntryAllocationSize = 2;
5962 }
5963
5964 /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
5965 * invalid. Set the compute pipeline to dirty to force a re-emit of the
5966 * pipeline in case we get back-to-back dispatch calls with the same
5967 * pipeline and a PIPELINE_SELECT in between.
5968 */
5969 cmd_buffer->state.compute.pipeline_dirty = true;
5970 }
5971 #endif
5972
5973 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
5974 * PIPELINE_SELECT [DevBWR+]":
5975 *
5976 * Project: DEVSNB+
5977 *
5978 * Software must ensure all the write caches are flushed through a
5979 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
5980 * command to invalidate read only caches prior to programming
5981 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
5982 *
5983 * Note the cmd_buffer_apply_pipe_flushes will split this into two
5984 * PIPE_CONTROLs.
5985 */
5986 anv_add_pending_pipe_bits(cmd_buffer,
5987 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5988 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
5989 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
5990 ANV_PIPE_CS_STALL_BIT |
5991 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5992 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
5993 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
5994 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT,
5995 "flush and invalidate for PIPELINE_SELECT");
5996 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5997
5998 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
5999 #if GFX_VER >= 9
6000 ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
6001 ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
6002 #endif
6003 ps.PipelineSelection = pipeline;
6004 }
6005
6006 #if GFX_VER == 9
6007 if (devinfo->platform == INTEL_PLATFORM_GLK) {
6008 /* Project: DevGLK
6009 *
6010 * "This chicken bit works around a hardware issue with barrier logic
6011 * encountered when switching between GPGPU and 3D pipelines. To
6012 * workaround the issue, this mode bit should be set after a pipeline
6013 * is selected."
6014 */
6015 anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
6016 scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
6017 : GLK_BARRIER_MODE_3D_HULL;
6018 scec1.GLKBarrierModeMask = 1;
6019 }
6020 }
6021 #endif
6022
6023 cmd_buffer->state.current_pipeline = pipeline;
6024 }
6025
6026 void
6027 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
6028 {
6029 genX(flush_pipeline_select)(cmd_buffer, _3D);
6030 }
6031
6032 void
6033 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
6034 {
6035 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
6036 }
6037
6038 void
6039 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
6040 {
6041 if (GFX_VER >= 8)
6042 return;
6043
6044 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
6045 *
6046 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
6047 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
6048 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
6049 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
6050 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
6051 * Depth Flush Bit set, followed by another pipelined depth stall
6052 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
6053 * guarantee that the pipeline from WM onwards is already flushed (e.g.,
6054 * via a preceding MI_FLUSH)."
6055 */
6056 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6057 pipe.DepthStallEnable = true;
6058 anv_debug_dump_pc(pipe);
6059 }
6060 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6061 pipe.DepthCacheFlushEnable = true;
6062 #if GFX_VER >= 12
6063 pipe.TileCacheFlushEnable = true;
6064 #endif
6065 anv_debug_dump_pc(pipe);
6066 }
6067 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6068 pipe.DepthStallEnable = true;
6069 anv_debug_dump_pc(pipe);
6070 }
6071 }
6072
6073 void
6074 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
6075 const struct isl_surf *surf)
6076 {
6077 #if GFX_VERx10 == 120
6078 const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;
6079
6080 switch (cmd_buffer->state.depth_reg_mode) {
6081 case ANV_DEPTH_REG_MODE_HW_DEFAULT:
6082 if (!fmt_is_d16)
6083 return;
6084 break;
6085 case ANV_DEPTH_REG_MODE_D16:
6086 if (fmt_is_d16)
6087 return;
6088 break;
6089 case ANV_DEPTH_REG_MODE_UNKNOWN:
6090 break;
6091 }
6092
6093 /* We'll change some CHICKEN registers depending on the depth surface
6094 * format. Do a depth flush and stall so the pipeline is not using these
6095 * settings while we change the registers.
6096 */
6097 anv_add_pending_pipe_bits(cmd_buffer,
6098 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
6099 ANV_PIPE_DEPTH_STALL_BIT |
6100 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
6101 "Workaround: Stop pipeline for 14010455700");
6102 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6103
6104 /* Wa_14010455700
6105 *
6106 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6107 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6108 */
6109 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6110 reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
6111 reg.HIZPlaneOptimizationdisablebitMask = true;
6112 }
6113
6114 /* Wa_1806527549
6115 *
6116 * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
6117 */
6118 anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
6119 reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
6120 reg.HZDepthTestLEGEOptimizationDisableMask = true;
6121 }
6122
6123 cmd_buffer->state.depth_reg_mode =
6124 fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
6125 #endif
6126 }
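/* Note: this helper is called from cmd_buffer_emit_depth_stencil() below
 * whenever a depth surface is emitted, and the depth_reg_mode tracking above
 * lets it skip the flush and register writes when the format class (D16 vs.
 * non-D16) has not changed since the last emit.
 */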
6127
6128 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
6129 *
6130 * "The VF cache needs to be invalidated before binding and then using
6131 * Vertex Buffers that overlap with any previously bound Vertex Buffer
6132 * (at a 64B granularity) since the last invalidation. A VF cache
6133 * invalidate is performed by setting the "VF Cache Invalidation Enable"
6134 * bit in PIPE_CONTROL."
6135 *
6136 * This is implemented by carefully tracking all vertex and index buffer
6137 * bindings and flushing if the cache would ever end up with a range
6138 * exceeding 4 GiB. This is implemented in three parts:
6139 *
6140 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
6141 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
6142 * tracking code of the new binding. If this new binding would cause
6143 * the cache to have a too-large range on the next draw call, a pipeline
6144 * stall and VF cache invalidate are added to pending_pipeline_bits.
6145 *
6146 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
6147 * empty whenever we emit a VF invalidate.
6148 *
6149 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
6150 * after every 3DPRIMITIVE and copies the bound range into the dirty
6151 * range for each used buffer. This has to be a separate step because
6152 * we don't always re-bind all buffers and so 1. can't know which
6153 * buffers are actually bound.
6154 */
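/* A rough sketch of how the three pieces are expected to interact (the names
 * cmd, addr, size, access and vb_used below are placeholders; the real call
 * sites live in the vertex-buffer and draw emission paths of this file):
 *
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd, vb_index, addr, size);
 *    ...emit 3DSTATE_VERTEX_BUFFERS / 3DSTATE_INDEX_BUFFER...
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd);   // resets tracking on VF invalidate
 *    ...emit 3DPRIMITIVE...
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd, access, vb_used);
 */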
6155 void
6156 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6157 int vb_index,
6158 struct anv_address vb_address,
6159 uint32_t vb_size)
6160 {
6161 if (GFX_VER < 8 || GFX_VER > 9 ||
6162 anv_use_relocations(cmd_buffer->device->physical))
6163 return;
6164
6165 struct anv_vb_cache_range *bound, *dirty;
6166 if (vb_index == -1) {
6167 bound = &cmd_buffer->state.gfx.ib_bound_range;
6168 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6169 } else {
6170 assert(vb_index >= 0);
6171 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6172 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6173 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
6174 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
6175 }
6176
6177 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
6178 vb_address,
6179 vb_size)) {
6180 anv_add_pending_pipe_bits(cmd_buffer,
6181 ANV_PIPE_CS_STALL_BIT |
6182 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
6183 "vb > 32b range");
6184 }
6185 }
6186
6187 void
6188 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6189 uint32_t access_type,
6190 uint64_t vb_used)
6191 {
6192 if (GFX_VER < 8 || GFX_VER > 9 ||
6193 anv_use_relocations(cmd_buffer->device->physical))
6194 return;
6195
6196 if (access_type == RANDOM) {
6197 /* We have an index buffer */
6198 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
6199 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6200
6201 if (bound->end > bound->start) {
6202 dirty->start = MIN2(dirty->start, bound->start);
6203 dirty->end = MAX2(dirty->end, bound->end);
6204 }
6205 }
6206
6207 uint64_t mask = vb_used;
6208 while (mask) {
6209 int i = u_bit_scan64(&mask);
6210 assert(i >= 0);
6211 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6212 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6213
6214 struct anv_vb_cache_range *bound, *dirty;
6215 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
6216 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
6217
6218 if (bound->end > bound->start) {
6219 dirty->start = MIN2(dirty->start, bound->start);
6220 dirty->end = MAX2(dirty->end, bound->end);
6221 }
6222 }
6223 }
6224
6225 /**
6226 * Update the pixel hashing modes that determine the balancing of PS threads
6227 * across subslices and slices.
6228 *
6229 * \param width Width bound of the rendering area (already scaled down if \p
6230 * scale is greater than 1).
6231 * \param height Height bound of the rendering area (already scaled down if \p
6232 * scale is greater than 1).
6233 * \param scale The number of framebuffer samples that could potentially be
6234 * affected by an individual channel of the PS thread. This is
6235 * typically one for single-sampled rendering, but for operations
6236 * like CCS resolves and fast clears a single PS invocation may
6237 * update a huge number of pixels, in which case a finer
6238 * balancing is desirable in order to maximally utilize the
6239 * bandwidth available. UINT_MAX can be used as shorthand for
6240 * "finest hashing mode available".
6241 */
6242 void
6243 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
6244 unsigned width, unsigned height,
6245 unsigned scale)
6246 {
6247 #if GFX_VER == 9
6248 const struct intel_device_info *devinfo = &cmd_buffer->device->info;
6249 const unsigned slice_hashing[] = {
6250 /* Because all Gfx9 platforms with more than one slice require
6251 * three-way subslice hashing, a single "normal" 16x16 slice hashing
6252 * block is guaranteed to suffer from substantial imbalance, with one
6253 * subslice receiving twice as much work as the other two in the
6254 * slice.
6255 *
6256 * The performance impact of that would be particularly severe when
6257 * three-way hashing is also in use for slice balancing (which is the
6258 * case for all Gfx9 GT4 platforms), because one of the slices
6259 * receives one every three 16x16 blocks in either direction, which
6260 * is roughly the periodicity of the underlying subslice imbalance
6261 * pattern ("roughly" because in reality the hardware's
6262 * implementation of three-way hashing doesn't do exact modulo 3
6263 * arithmetic, which somewhat decreases the magnitude of this effect
6264 * in practice). This leads to a systematic subslice imbalance
6265 * within that slice regardless of the size of the primitive. The
6266 * 32x32 hashing mode guarantees that the subslice imbalance within a
6267 * single slice hashing block is minimal, largely eliminating this
6268 * effect.
6269 */
6270 _32x32,
6271 /* Finest slice hashing mode available. */
6272 NORMAL
6273 };
6274 const unsigned subslice_hashing[] = {
6275 /* 16x16 would provide a slight cache locality benefit especially
6276 * visible in the sampler L1 cache efficiency of low-bandwidth
6277 * non-LLC platforms, but it comes at the cost of greater subslice
6278 * imbalance for primitives of dimensions approximately intermediate
6279 * between 16x4 and 16x16.
6280 */
6281 _16x4,
6282 /* Finest subslice hashing mode available. */
6283 _8x4
6284 };
6285 /* Dimensions of the smallest hashing block of a given hashing mode. If
6286 * the rendering area is smaller than this there can't possibly be any
6287 * benefit from switching to this mode, so we optimize out the
6288 * transition.
6289 */
6290 const unsigned min_size[][2] = {
6291 { 16, 4 },
6292 { 8, 4 }
6293 };
6294 const unsigned idx = scale > 1;
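   /* idx == 0 (regular rendering, scale <= 1) picks the coarser 32x32 slice /
    * 16x4 subslice hashing above; idx == 1 (scale > 1, e.g. CCS resolves and
    * fast clears) picks the finest modes available (NORMAL / 8x4).
    */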
6295
6296 if (cmd_buffer->state.current_hash_scale != scale &&
6297 (width > min_size[idx][0] || height > min_size[idx][1])) {
6298 anv_add_pending_pipe_bits(cmd_buffer,
6299 ANV_PIPE_CS_STALL_BIT |
6300 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6301 "change pixel hash mode");
6302 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6303
6304 anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
6305 gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
6306 gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
6307 gt.SubsliceHashing = subslice_hashing[idx];
6308 gt.SubsliceHashingMask = -1;
6309 }
6310
6311 cmd_buffer->state.current_hash_scale = scale;
6312 }
6313 #endif
6314 }
6315
6316 static void
6317 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
6318 {
6319 struct anv_device *device = cmd_buffer->device;
6320 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6321
6322 /* FIXME: Width and Height are wrong */
6323
6324 genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
6325
6326 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6327 device->isl_dev.ds.size / 4);
6328 if (dw == NULL)
6329 return;
6330
6331 struct isl_depth_stencil_hiz_emit_info info = {
6332 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
6333 };
6334
6335 if (gfx->depth_att.iview != NULL) {
6336 info.view = &gfx->depth_att.iview->planes[0].isl;
6337 } else if (gfx->stencil_att.iview != NULL) {
6338 info.view = &gfx->stencil_att.iview->planes[0].isl;
6339 }
6340
6341 if (gfx->depth_att.iview != NULL) {
6342 const struct anv_image_view *iview = gfx->depth_att.iview;
6343 const struct anv_image *image = iview->image;
6344
6345 info.view = &iview->planes[0].isl;
6346
6347 const uint32_t depth_plane =
6348 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
6349 const struct anv_surface *depth_surface =
6350 &image->planes[depth_plane].primary_surface;
6351 const struct anv_address depth_address =
6352 anv_image_address(image, &depth_surface->memory_range);
6353
6354 info.depth_surf = &depth_surface->isl;
6355
6356 info.depth_address =
6357 anv_batch_emit_reloc(&cmd_buffer->batch,
6358 dw + device->isl_dev.ds.depth_offset / 4,
6359 depth_address.bo, depth_address.offset);
6360 info.mocs =
6361 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
6362
6363 info.hiz_usage = gfx->depth_att.aux_usage;
6364 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
6365 assert(isl_aux_usage_has_hiz(info.hiz_usage));
6366
6367 const struct anv_surface *hiz_surface =
6368 &image->planes[depth_plane].aux_surface;
6369 const struct anv_address hiz_address =
6370 anv_image_address(image, &hiz_surface->memory_range);
6371
6372 info.hiz_surf = &hiz_surface->isl;
6373
6374 info.hiz_address =
6375 anv_batch_emit_reloc(&cmd_buffer->batch,
6376 dw + device->isl_dev.ds.hiz_offset / 4,
6377 hiz_address.bo, hiz_address.offset);
6378
6379 info.depth_clear_value = ANV_HZ_FC_VAL;
6380 }
6381 }
6382
6383 if (gfx->stencil_att.iview != NULL) {
6384 const struct anv_image_view *iview = gfx->stencil_att.iview;
6385 const struct anv_image *image = iview->image;
6386
6387 if (info.view == NULL)
6388 info.view = &iview->planes[0].isl;
6389
6390 const uint32_t stencil_plane =
6391 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
6392 const struct anv_surface *stencil_surface =
6393 &image->planes[stencil_plane].primary_surface;
6394 const struct anv_address stencil_address =
6395 anv_image_address(image, &stencil_surface->memory_range);
6396
6397 info.stencil_surf = &stencil_surface->isl;
6398
6399 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
6400 info.stencil_address =
6401 anv_batch_emit_reloc(&cmd_buffer->batch,
6402 dw + device->isl_dev.ds.stencil_offset / 4,
6403 stencil_address.bo, stencil_address.offset);
6404 info.mocs =
6405 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
6406 }
6407
6408 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
6409
6410 if (info.depth_surf)
6411 genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
6412
6413 if (GFX_VER >= 12) {
6414 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6415 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6416
6417 /* Wa_1408224581
6418 *
6419 * Workaround (Gfx12LP A-step only): an additional pipe control with a
6420 * post-sync = store dword operation is required (the w/a is to
6421 * have an additional pipe control after the stencil state whenever
6422 * the surface state bits of this state change).
6423 *
6424 * This also seems sufficient to handle Wa_14014148106.
6425 */
6426 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6427 pc.PostSyncOperation = WriteImmediateData;
6428 pc.Address = cmd_buffer->device->workaround_address;
6429 }
6430 }
6431 cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
6432 }
6433
6434 static void
6435 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
6436 const struct anv_image_view *fsr_iview)
6437 {
6438 #if GFX_VERx10 >= 125
6439 struct anv_device *device = cmd_buffer->device;
6440
6441 if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
6442 return;
6443
6444 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6445 device->isl_dev.cpb.size / 4);
6446 if (dw == NULL)
6447 return;
6448
6449 struct isl_cpb_emit_info info = { };
6450
6451 if (fsr_iview) {
6452 info.view = &fsr_iview->planes[0].isl;
6453 info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
6454 info.address =
6455 anv_batch_emit_reloc(&cmd_buffer->batch,
6456 dw + device->isl_dev.cpb.offset / 4,
6457 fsr_iview->image->bindings[0].address.bo,
6458 fsr_iview->image->bindings[0].address.offset +
6459 fsr_iview->image->bindings[0].memory_range.offset);
6460 info.mocs =
6461 anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
6462 ISL_SURF_USAGE_CPB_BIT);
6463 }
6464
6465 isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
6466 #endif /* GFX_VERx10 >= 125 */
6467 }
6468
6469 static VkImageLayout
6470 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
6471 {
6472 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
6473 vk_find_struct_const(att->pNext,
6474 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
6475 if (layout_info != NULL)
6476 return layout_info->initialLayout;
6477
6478 return att->imageLayout;
6479 }
6480
6481 void genX(CmdBeginRendering)(
6482 VkCommandBuffer commandBuffer,
6483 const VkRenderingInfo* pRenderingInfo)
6484 {
6485 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6486 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6487 VkResult result;
6488
6489 if (!is_render_queue_cmd_buffer(cmd_buffer)) {
6490 assert(!"Trying to start a render pass on non-render queue!");
6491 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
6492 return;
6493 }
6494
6495 anv_measure_beginrenderpass(cmd_buffer);
6496 trace_intel_begin_render_pass(&cmd_buffer->trace, cmd_buffer);
6497
6498 gfx->rendering_flags = pRenderingInfo->flags;
6499 gfx->render_area = pRenderingInfo->renderArea;
6500 gfx->view_mask = pRenderingInfo->viewMask;
6501 gfx->layer_count = pRenderingInfo->layerCount;
6502 gfx->samples = 0;
6503
6504 const bool is_multiview = gfx->view_mask != 0;
6505 const VkRect2D render_area = gfx->render_area;
6506 const uint32_t layers =
6507 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
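   /* e.g. a viewMask of 0b1010 renders views 1 and 3, so the attachments must
    * cover util_last_bit(0b1010) = 4 layers even though only two of them are
    * actually written.
    */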
6508
6509 /* The framebuffer size is at least large enough to contain the render
6510 * area. Because a zero renderArea is possible, we MAX with 1.
6511 */
6512 struct isl_extent3d fb_size = {
6513 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
6514 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
6515 .d = layers,
6516 };
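   /* For example, a renderArea at offset (100, 50) with extent 800x600 needs
    * a framebuffer of at least 900x650, and a zero-sized renderArea still
    * yields a legal 1x1 (times layers) extent thanks to the MAX2 above.
    */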
6517
6518 /* Reserve one for the NULL state. */
6519 uint32_t color_att_valid = 0;
6520 uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
6521 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
6522 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
6523 color_att_valid |= BITFIELD_BIT(i);
6524 }
6525 result = anv_cmd_buffer_init_attachments(cmd_buffer,
6526 color_att_count,
6527 color_att_valid);
6528 if (result != VK_SUCCESS)
6529 return;
6530
6531 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
6532 if (!(color_att_valid & BITFIELD_BIT(i)))
6533 continue;
6534
6535 const VkRenderingAttachmentInfo *att =
6536 &pRenderingInfo->pColorAttachments[i];
6537 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
6538 const VkImageLayout initial_layout = attachment_initial_layout(att);
6539
6540 assert(render_area.offset.x + render_area.extent.width <=
6541 iview->vk.extent.width);
6542 assert(render_area.offset.y + render_area.extent.height <=
6543 iview->vk.extent.height);
6544 assert(layers <= iview->vk.layer_count);
6545
6546 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
6547 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
6548
6549 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
6550 gfx->samples |= iview->vk.image->samples;
6551
6552 enum isl_aux_usage aux_usage =
6553 anv_layout_to_aux_usage(&cmd_buffer->device->info,
6554 iview->image,
6555 VK_IMAGE_ASPECT_COLOR_BIT,
6556 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
6557 att->imageLayout);
6558
6559 union isl_color_value fast_clear_color = { .u32 = { 0, } };
6560
6561 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6562 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
6563 const union isl_color_value clear_color =
6564 vk_to_isl_color_with_format(att->clearValue.color,
6565 iview->planes[0].isl.format);
6566
6567 /* We only support fast-clears on the first layer */
6568 const bool fast_clear =
6569 (!is_multiview || (gfx->view_mask & 1)) &&
6570 anv_can_fast_clear_color_view(cmd_buffer->device, iview,
6571 att->imageLayout, clear_color,
6572 layers, render_area);
6573
6574 if (att->imageLayout != initial_layout) {
6575 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6576 render_area.extent.width == iview->vk.extent.width &&
6577 render_area.extent.height == iview->vk.extent.height);
6578 if (is_multiview) {
6579 u_foreach_bit(view, gfx->view_mask) {
6580 transition_color_buffer(cmd_buffer, iview->image,
6581 VK_IMAGE_ASPECT_COLOR_BIT,
6582 iview->vk.base_mip_level, 1,
6583 iview->vk.base_array_layer + view,
6584 1, /* layer_count */
6585 initial_layout, att->imageLayout,
6586 VK_QUEUE_FAMILY_IGNORED,
6587 VK_QUEUE_FAMILY_IGNORED,
6588 fast_clear);
6589 }
6590 } else {
6591 transition_color_buffer(cmd_buffer, iview->image,
6592 VK_IMAGE_ASPECT_COLOR_BIT,
6593 iview->vk.base_mip_level, 1,
6594 iview->vk.base_array_layer,
6595 gfx->layer_count,
6596 initial_layout, att->imageLayout,
6597 VK_QUEUE_FAMILY_IGNORED,
6598 VK_QUEUE_FAMILY_IGNORED,
6599 fast_clear);
6600 }
6601 }
6602
6603 uint32_t clear_view_mask = pRenderingInfo->viewMask;
6604 uint32_t base_clear_layer = iview->vk.base_array_layer;
6605 uint32_t clear_layer_count = gfx->layer_count;
6606 if (fast_clear) {
6607 /* We only support fast-clears on the first layer */
6608 assert(iview->vk.base_mip_level == 0 &&
6609 iview->vk.base_array_layer == 0);
6610
6611 fast_clear_color = clear_color;
6612
6613 if (iview->image->vk.samples == 1) {
6614 anv_image_ccs_op(cmd_buffer, iview->image,
6615 iview->planes[0].isl.format,
6616 iview->planes[0].isl.swizzle,
6617 VK_IMAGE_ASPECT_COLOR_BIT,
6618 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
6619 &fast_clear_color,
6620 false);
6621 } else {
6622 anv_image_mcs_op(cmd_buffer, iview->image,
6623 iview->planes[0].isl.format,
6624 iview->planes[0].isl.swizzle,
6625 VK_IMAGE_ASPECT_COLOR_BIT,
6626 0, 1, ISL_AUX_OP_FAST_CLEAR,
6627 &fast_clear_color,
6628 false);
6629 }
6630 clear_view_mask &= ~1u;
6631 base_clear_layer++;
6632 clear_layer_count--;
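            /* Layer 0 (or view 0) was handled by the fast clear above, so it
             * is dropped from the slow-clear set; any remaining layers/views
             * fall through to the regular color clears below.
             */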
6633
6634 if (isl_color_value_is_zero(clear_color,
6635 iview->planes[0].isl.format)) {
6636 /* This image has the auxiliary buffer enabled. We can mark the
6637 * subresource as not needing a resolve because the clear color
6638 * will match what's in every RENDER_SURFACE_STATE object when
6639 * it's being used for sampling.
6640 */
6641 set_image_fast_clear_state(cmd_buffer, iview->image,
6642 VK_IMAGE_ASPECT_COLOR_BIT,
6643 ANV_FAST_CLEAR_DEFAULT_VALUE);
6644 } else {
6645 set_image_fast_clear_state(cmd_buffer, iview->image,
6646 VK_IMAGE_ASPECT_COLOR_BIT,
6647 ANV_FAST_CLEAR_ANY);
6648 }
6649 }
6650
6651 if (is_multiview) {
6652 u_foreach_bit(view, clear_view_mask) {
6653 anv_image_clear_color(cmd_buffer, iview->image,
6654 VK_IMAGE_ASPECT_COLOR_BIT,
6655 aux_usage,
6656 iview->planes[0].isl.format,
6657 iview->planes[0].isl.swizzle,
6658 iview->vk.base_mip_level,
6659 iview->vk.base_array_layer + view, 1,
6660 render_area, clear_color);
6661 }
6662 } else {
6663 anv_image_clear_color(cmd_buffer, iview->image,
6664 VK_IMAGE_ASPECT_COLOR_BIT,
6665 aux_usage,
6666 iview->planes[0].isl.format,
6667 iview->planes[0].isl.swizzle,
6668 iview->vk.base_mip_level,
6669 base_clear_layer, clear_layer_count,
6670 render_area, clear_color);
6671 }
6672 } else {
6673 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6674 assert(att->imageLayout == initial_layout);
6675 }
6676
6677 gfx->color_att[i].vk_format = iview->vk.format;
6678 gfx->color_att[i].iview = iview;
6679 gfx->color_att[i].layout = att->imageLayout;
6680 gfx->color_att[i].aux_usage = aux_usage;
6681
6682 anv_image_fill_surface_state(cmd_buffer->device,
6683 iview->image,
6684 VK_IMAGE_ASPECT_COLOR_BIT,
6685 &iview->planes[0].isl,
6686 ISL_SURF_USAGE_RENDER_TARGET_BIT,
6687 aux_usage, &fast_clear_color,
6688 0, /* anv_image_view_state_flags */
6689 &gfx->color_att[i].surface_state,
6690 NULL);
6691
6692 add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
6693
6694 if (GFX_VER < 10 &&
6695 (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
6696 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
6697 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
6698 iview->planes[0].isl.base_level == 0 &&
6699 iview->planes[0].isl.base_array_layer == 0) {
6700 genX(copy_fast_clear_dwords)(cmd_buffer,
6701 gfx->color_att[i].surface_state.state,
6702 iview->image,
6703 VK_IMAGE_ASPECT_COLOR_BIT,
6704 false /* copy to ss */);
6705 }
6706
6707 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
6708 gfx->color_att[i].resolve_mode = att->resolveMode;
6709 gfx->color_att[i].resolve_iview =
6710 anv_image_view_from_handle(att->resolveImageView);
6711 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
6712 }
6713 }
6714
6715 const struct anv_image_view *fsr_iview = NULL;
6716 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
6717 vk_find_struct_const(pRenderingInfo->pNext,
6718 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
6719 if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
6720 fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
6721 /* imageLayout and shadingRateAttachmentTexelSize are ignored */
6722 }
6723
6724 const struct anv_image_view *ds_iview = NULL;
6725 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
6726 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
6727 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
6728 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
6729 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
6730 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6731 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6732 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6733 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6734 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
6735 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
6736 float depth_clear_value = 0;
6737 uint32_t stencil_clear_value = 0;
6738
6739 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
6740 d_iview = anv_image_view_from_handle(d_att->imageView);
6741 initial_depth_layout = attachment_initial_layout(d_att);
6742 depth_layout = d_att->imageLayout;
6743 depth_aux_usage =
6744 anv_layout_to_aux_usage(&cmd_buffer->device->info,
6745 d_iview->image,
6746 VK_IMAGE_ASPECT_DEPTH_BIT,
6747 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6748 depth_layout);
6749 depth_clear_value = d_att->clearValue.depthStencil.depth;
6750 }
6751
6752 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
6753 s_iview = anv_image_view_from_handle(s_att->imageView);
6754 initial_stencil_layout = attachment_initial_layout(s_att);
6755 stencil_layout = s_att->imageLayout;
6756 stencil_aux_usage =
6757 anv_layout_to_aux_usage(&cmd_buffer->device->info,
6758 s_iview->image,
6759 VK_IMAGE_ASPECT_STENCIL_BIT,
6760 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6761 stencil_layout);
6762 stencil_clear_value = s_att->clearValue.depthStencil.stencil;
6763 }
6764
6765 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
6766 ds_iview = d_iview != NULL ? d_iview : s_iview;
6767 assert(ds_iview != NULL);
6768
6769 assert(render_area.offset.x + render_area.extent.width <=
6770 ds_iview->vk.extent.width);
6771 assert(render_area.offset.y + render_area.extent.height <=
6772 ds_iview->vk.extent.height);
6773 assert(layers <= ds_iview->vk.layer_count);
6774
6775 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
6776 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
6777
6778 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
6779 gfx->samples |= ds_iview->vk.image->samples;
6780
6781 VkImageAspectFlags clear_aspects = 0;
6782 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6783 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6784 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
6785 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6786 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6787 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
6788
6789 if (clear_aspects != 0) {
6790 const bool hiz_clear =
6791 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
6792 depth_layout, clear_aspects,
6793 depth_clear_value,
6794 render_area);
6795
6796 if (depth_layout != initial_depth_layout) {
6797 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6798 render_area.extent.width == d_iview->vk.extent.width &&
6799 render_area.extent.height == d_iview->vk.extent.height);
6800
6801 if (is_multiview) {
6802 u_foreach_bit(view, gfx->view_mask) {
6803 transition_depth_buffer(cmd_buffer, d_iview->image,
6804 d_iview->vk.base_array_layer + view,
6805 1 /* layer_count */,
6806 initial_depth_layout, depth_layout,
6807 hiz_clear);
6808 }
6809 } else {
6810 transition_depth_buffer(cmd_buffer, d_iview->image,
6811 d_iview->vk.base_array_layer,
6812 gfx->layer_count,
6813 initial_depth_layout, depth_layout,
6814 hiz_clear);
6815 }
6816 }
6817
6818 if (stencil_layout != initial_stencil_layout) {
6819 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6820 render_area.extent.width == s_iview->vk.extent.width &&
6821 render_area.extent.height == s_iview->vk.extent.height);
6822
6823 if (is_multiview) {
6824 u_foreach_bit(view, gfx->view_mask) {
6825 transition_stencil_buffer(cmd_buffer, s_iview->image,
6826 s_iview->vk.base_mip_level, 1,
6827 s_iview->vk.base_array_layer + view,
6828 1 /* layer_count */,
6829 initial_stencil_layout,
6830 stencil_layout,
6831 hiz_clear);
6832 }
6833 } else {
6834 transition_stencil_buffer(cmd_buffer, s_iview->image,
6835 s_iview->vk.base_mip_level, 1,
6836 s_iview->vk.base_array_layer,
6837 gfx->layer_count,
6838 initial_stencil_layout,
6839 stencil_layout,
6840 hiz_clear);
6841 }
6842 }
6843
6844 if (is_multiview) {
6845 uint32_t clear_view_mask = pRenderingInfo->viewMask;
6846 while (clear_view_mask) {
6847 int view = u_bit_scan(&clear_view_mask);
6848
6849 uint32_t level = ds_iview->vk.base_mip_level;
6850 uint32_t layer = ds_iview->vk.base_array_layer + view;
6851
6852 if (hiz_clear) {
6853 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6854 clear_aspects,
6855 level, layer, 1,
6856 render_area,
6857 stencil_clear_value);
6858 } else {
6859 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6860 clear_aspects,
6861 depth_aux_usage,
6862 level, layer, 1,
6863 render_area,
6864 depth_clear_value,
6865 stencil_clear_value);
6866 }
6867 }
6868 } else {
6869 uint32_t level = ds_iview->vk.base_mip_level;
6870 uint32_t base_layer = ds_iview->vk.base_array_layer;
6871 uint32_t layer_count = gfx->layer_count;
6872
6873 if (hiz_clear) {
6874 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6875 clear_aspects,
6876 level, base_layer, layer_count,
6877 render_area,
6878 stencil_clear_value);
6879 } else {
6880 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6881 clear_aspects,
6882 depth_aux_usage,
6883 level, base_layer, layer_count,
6884 render_area,
6885 depth_clear_value,
6886 stencil_clear_value);
6887 }
6888 }
6889 } else {
6890 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6891 assert(depth_layout == initial_depth_layout);
6892 assert(stencil_layout == initial_stencil_layout);
6893 }
6894
6895 if (d_iview != NULL) {
6896 gfx->depth_att.vk_format = d_iview->vk.format;
6897 gfx->depth_att.iview = d_iview;
6898 gfx->depth_att.layout = depth_layout;
6899 gfx->depth_att.aux_usage = depth_aux_usage;
6900 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6901 assert(d_att->resolveImageView != VK_NULL_HANDLE);
6902 gfx->depth_att.resolve_mode = d_att->resolveMode;
6903 gfx->depth_att.resolve_iview =
6904 anv_image_view_from_handle(d_att->resolveImageView);
6905 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
6906 }
6907 }
6908
6909 if (s_iview != NULL) {
6910 gfx->stencil_att.vk_format = s_iview->vk.format;
6911 gfx->stencil_att.iview = s_iview;
6912 gfx->stencil_att.layout = stencil_layout;
6913 gfx->stencil_att.aux_usage = stencil_aux_usage;
6914 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6915 assert(s_att->resolveImageView != VK_NULL_HANDLE);
6916 gfx->stencil_att.resolve_mode = s_att->resolveMode;
6917 gfx->stencil_att.resolve_iview =
6918 anv_image_view_from_handle(s_att->resolveImageView);
6919 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
6920 }
6921 }
6922 }
6923
6924 /* Finally, now that we know the right size, set up the null surface */
6925 assert(util_bitcount(gfx->samples) <= 1);
6926 isl_null_fill_state(&cmd_buffer->device->isl_dev,
6927 gfx->null_surface_state.map,
6928 .size = fb_size);
6929
6930 /****** We can now start emitting code to begin the render pass ******/
6931
6932 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
6933
6934 /* Our implementation of VK_KHR_multiview uses instancing to draw the
6935 * different views. If the client asks for instancing, we need to use the
6936 * Instance Data Step Rate to ensure that we repeat the client's
6937 * per-instance data once for each view. Since this bit is in
6938 * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
6939 * of each subpass.
6940 */
6941 if (GFX_VER == 7)
6942 gfx->vb_dirty |= ~0;
6943
6944 /* It is possible to start a render pass with an old pipeline. Because the
6945 * render pass and subpass index are both baked into the pipeline, this is
6946 * highly unlikely. In order to do so, it requires that you have a render
6947 * pass with a single subpass and that you use that render pass twice
6948 * back-to-back and use the same pipeline at the start of the second render
6949 * pass as at the end of the first. In order to avoid unpredictable issues
6950 * with this edge case, we just dirty the pipeline at the start of every
6951 * subpass.
6952 */
6953 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
6954
6955 #if GFX_VER >= 11
6956 /* The PIPE_CONTROL command description says:
6957 *
6958 * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
6959 * points to a different RENDER_SURFACE_STATE, SW must issue a Render
6960 * Target Cache Flush by enabling this bit. When render target flush
6961 * is set due to new association of BTI, PS Scoreboard Stall bit must
6962 * be set in this packet."
6963 */
6964 anv_add_pending_pipe_bits(cmd_buffer,
6965 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
6966 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6967 "change RT");
6968 #endif
6969
6970 cmd_buffer_emit_depth_stencil(cmd_buffer);
6971
6972 cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
6973 }
6974
6975 static void
6976 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
6977 struct anv_attachment *att,
6978 VkImageAspectFlagBits aspect)
6979 {
6980 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6981 const struct anv_image_view *iview = att->iview;
6982
6983 if (gfx->view_mask == 0) {
6984 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6985 aspect, att->aux_usage,
6986 iview->planes[0].isl.base_level,
6987 iview->planes[0].isl.base_array_layer,
6988 gfx->layer_count);
6989 } else {
6990 uint32_t res_view_mask = gfx->view_mask;
6991 while (res_view_mask) {
6992 int i = u_bit_scan(&res_view_mask);
6993
6994 const uint32_t level = iview->planes[0].isl.base_level;
6995 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
6996
6997 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6998 aspect, att->aux_usage,
6999 level, layer, 1);
7000 }
7001 }
7002 }
7003
7004 static enum blorp_filter
7005 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
7006 {
7007 switch (vk_mode) {
7008 case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
7009 return BLORP_FILTER_SAMPLE_0;
7010 case VK_RESOLVE_MODE_AVERAGE_BIT:
7011 return BLORP_FILTER_AVERAGE;
7012 case VK_RESOLVE_MODE_MIN_BIT:
7013 return BLORP_FILTER_MIN_SAMPLE;
7014 case VK_RESOLVE_MODE_MAX_BIT:
7015 return BLORP_FILTER_MAX_SAMPLE;
7016 default:
7017 return BLORP_FILTER_NONE;
7018 }
7019 }
7020
7021 static void
7022 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
7023 const struct anv_attachment *att,
7024 VkImageLayout layout,
7025 VkImageAspectFlagBits aspect)
7026 {
7027 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7028 const struct anv_image_view *src_iview = att->iview;
7029 const struct anv_image_view *dst_iview = att->resolve_iview;
7030
7031 enum isl_aux_usage src_aux_usage =
7032 anv_layout_to_aux_usage(&cmd_buffer->device->info,
7033 src_iview->image, aspect,
7034 VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
7035 layout);
7036
7037 enum isl_aux_usage dst_aux_usage =
7038 anv_layout_to_aux_usage(&cmd_buffer->device->info,
7039 dst_iview->image, aspect,
7040 VK_IMAGE_USAGE_TRANSFER_DST_BIT,
7041 att->resolve_layout);
7042
7043 enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
7044
7045 const VkRect2D render_area = gfx->render_area;
7046 if (gfx->view_mask == 0) {
7047 anv_image_msaa_resolve(cmd_buffer,
7048 src_iview->image, src_aux_usage,
7049 src_iview->planes[0].isl.base_level,
7050 src_iview->planes[0].isl.base_array_layer,
7051 dst_iview->image, dst_aux_usage,
7052 dst_iview->planes[0].isl.base_level,
7053 dst_iview->planes[0].isl.base_array_layer,
7054 aspect,
7055 render_area.offset.x, render_area.offset.y,
7056 render_area.offset.x, render_area.offset.y,
7057 render_area.extent.width,
7058 render_area.extent.height,
7059 gfx->layer_count, filter);
7060 } else {
7061 uint32_t res_view_mask = gfx->view_mask;
7062 while (res_view_mask) {
7063 int i = u_bit_scan(&res_view_mask);
7064
7065 anv_image_msaa_resolve(cmd_buffer,
7066 src_iview->image, src_aux_usage,
7067 src_iview->planes[0].isl.base_level,
7068 src_iview->planes[0].isl.base_array_layer + i,
7069 dst_iview->image, dst_aux_usage,
7070 dst_iview->planes[0].isl.base_level,
7071 dst_iview->planes[0].isl.base_array_layer + i,
7072 aspect,
7073 render_area.offset.x, render_area.offset.y,
7074 render_area.offset.x, render_area.offset.y,
7075 render_area.extent.width,
7076 render_area.extent.height,
7077 1, filter);
7078 }
7079 }
7080 }
7081
7082 void genX(CmdEndRendering)(
7083 VkCommandBuffer commandBuffer)
7084 {
7085 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7086 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7087
7088 if (anv_batch_has_error(&cmd_buffer->batch))
7089 return;
7090
7091 const bool is_multiview = gfx->view_mask != 0;
7092 const uint32_t layers =
7093 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
7094
7095 bool has_color_resolve = false;
7096 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7097 if (gfx->color_att[i].iview == NULL)
7098 continue;
7099
7100 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
7101 VK_IMAGE_ASPECT_COLOR_BIT);
7102
7103 /* Stash this off for later */
7104 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
7105 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7106 has_color_resolve = true;
7107 }
7108
7109 if (gfx->depth_att.iview != NULL) {
7110 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
7111 VK_IMAGE_ASPECT_DEPTH_BIT);
7112 }
7113
7114 if (gfx->stencil_att.iview != NULL) {
7115 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
7116 VK_IMAGE_ASPECT_STENCIL_BIT);
7117 }
7118
7119 if (has_color_resolve) {
7120 /* We are about to do some MSAA resolves. We need to flush so that the
7121 * result of writes to the MSAA color attachments show up in the sampler
7122 * when we blit to the single-sampled resolve target.
7123 */
7124 anv_add_pending_pipe_bits(cmd_buffer,
7125 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7126 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
7127 "MSAA resolve");
7128 }
7129
7130 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
7131 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
7132 /* We are about to do some MSAA resolves. We need to flush so that the
7133 * result of writes to the MSAA depth attachments show up in the sampler
7134 * when we blit to the single-sampled resolve target.
7135 */
7136 anv_add_pending_pipe_bits(cmd_buffer,
7137 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7138 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
7139 "MSAA resolve");
7140 }
7141
7142 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7143 const struct anv_attachment *att = &gfx->color_att[i];
7144 if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
7145 (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7146 continue;
7147
7148 cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
7149 VK_IMAGE_ASPECT_COLOR_BIT);
7150 }
7151
7152 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7153 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7154 const struct anv_image_view *src_iview = gfx->depth_att.iview;
7155
7156 /* MSAA resolves sample from the source attachment. Transition the
7157 * depth attachment first to get rid of any HiZ that we may not be
7158 * able to handle.
7159 */
7160 transition_depth_buffer(cmd_buffer, src_iview->image,
7161 src_iview->planes[0].isl.base_array_layer,
7162 layers,
7163 gfx->depth_att.layout,
7164 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7165 false /* will_full_fast_clear */);
7166
7167 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
7168 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7169 VK_IMAGE_ASPECT_DEPTH_BIT);
7170
7171 /* Transition the source back to the original layout. This seems a bit
7172 * inefficient but, since HiZ resolves aren't destructive, going from
7173 * less HiZ to more is generally a no-op.
7174 */
7175 transition_depth_buffer(cmd_buffer, src_iview->image,
7176 src_iview->planes[0].isl.base_array_layer,
7177 layers,
7178 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7179 gfx->depth_att.layout,
7180 false /* will_full_fast_clear */);
7181 }
7182
7183 if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7184 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7185 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
7186 gfx->stencil_att.layout,
7187 VK_IMAGE_ASPECT_STENCIL_BIT);
7188 }
7189
7190 #if GFX_VER == 7
7191 /* On gfx7, we have to store a texturable version of the stencil buffer in
7192 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
7193 * forth at strategic points. Stencil writes are only allowed in the
7194 * following layouts:
7195 *
7196 * - VK_IMAGE_LAYOUT_GENERAL
7197 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
7198 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
7199 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
7200 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
7201 *
7202 * For general, we have no nice opportunity to transition so we do the copy
7203 * to the shadow unconditionally at the end of the subpass. For transfer
7204 * destinations, we can update it as part of the transfer op. For the other
7205 * layouts, we delay the copy until a transition into some other layout.
7206 */
7207 if (gfx->stencil_att.iview != NULL) {
7208 const struct anv_image_view *iview = gfx->stencil_att.iview;
7209 const struct anv_image *image = iview->image;
7210 const uint32_t plane =
7211 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
7212
7213 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
7214 gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL) {
7215 anv_image_copy_to_shadow(cmd_buffer, image,
7216 VK_IMAGE_ASPECT_STENCIL_BIT,
7217 iview->planes[plane].isl.base_level, 1,
7218 iview->planes[plane].isl.base_array_layer,
7219 layers);
7220 }
7221 }
7222 #endif
7223
7224 anv_cmd_buffer_reset_rendering(cmd_buffer);
7225 }
7226
7227 void
7228 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
7229 {
7230 #if GFX_VERx10 >= 75
7231 struct mi_builder b;
7232 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7233
7234 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
7235 mi_reg32(ANV_PREDICATE_RESULT_REG));
7236 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
7237
7238 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
7239 mip.LoadOperation = LOAD_LOADINV;
7240 mip.CombineOperation = COMBINE_SET;
7241 mip.CompareOperation = COMPARE_SRCS_EQUAL;
7242 }
7243 #endif
7244 }
7245
7246 #if GFX_VERx10 >= 75
7247 void genX(CmdBeginConditionalRenderingEXT)(
7248 VkCommandBuffer commandBuffer,
7249 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
7250 {
7251 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7252 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
7253 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
7254 struct anv_address value_address =
7255 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
7256
7257 const bool isInverted = pConditionalRenderingBegin->flags &
7258 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
7259
7260 cmd_state->conditional_render_enabled = true;
7261
7262 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7263
7264 struct mi_builder b;
7265 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7266
7267 /* Section 19.4 of the Vulkan 1.1.85 spec says:
7268 *
7269 * If the value of the predicate in buffer memory changes
7270 * while conditional rendering is active, the rendering commands
7271 * may be discarded in an implementation-dependent way.
7272 * Some implementations may latch the value of the predicate
7273 * upon beginning conditional rendering while others
7274 * may read it before every rendering command.
7275 *
7276 * So it's perfectly fine to read a value from the buffer once.
7277 */
7278 struct mi_value value = mi_mem32(value_address);
7279
7280 /* Precompute predicate result, it is necessary to support secondary
7281 * command buffers since it is unknown if conditional rendering is
7282 * inverted when populating them.
7283 */
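   /* mi_ult(0, value) produces a non-zero result exactly when the value read
    * from the buffer is non-zero (render in normal mode), while
    * mi_uge(0, value) produces a non-zero result exactly when the value is
    * zero (render in inverted mode), matching the
    * VK_EXT_conditional_rendering semantics.
    */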
7284 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
7285 isInverted ? mi_uge(&b, mi_imm(0), value) :
7286 mi_ult(&b, mi_imm(0), value));
7287 }
7288
7289 void genX(CmdEndConditionalRenderingEXT)(
7290 VkCommandBuffer commandBuffer)
7291 {
7292 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7293 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
7294
7295 cmd_state->conditional_render_enabled = false;
7296 }
7297 #endif
7298
7299 /* Set of stage bits for stages that are pipelined, i.e. whose work gets
7300 * queued by the command streamer for later execution.
7301 */
7302 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
7303 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | \
7304 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR | \
7305 VK_PIPELINE_STAGE_2_HOST_BIT_KHR | \
7306 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
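/* In other words, every stage except TOP_OF_PIPE, DRAW_INDIRECT, HOST and
 * CONDITIONAL_RENDERING is considered pipelined. When a set/reset event
 * covers any pipelined stage, a CS stall and pixel scoreboard stall are
 * conservatively added before the event value is written (see the
 * PIPE_CONTROL emission below).
 */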
7307
7308 void genX(CmdSetEvent2KHR)(
7309 VkCommandBuffer commandBuffer,
7310 VkEvent _event,
7311 const VkDependencyInfoKHR* pDependencyInfo)
7312 {
7313 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7314 ANV_FROM_HANDLE(anv_event, event, _event);
7315
7316 VkPipelineStageFlags2KHR src_stages = 0;
7317
7318 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
7319 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7320 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
7321 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7322 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
7323 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7324
7325 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
7326 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7327
7328 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
7329 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
7330 pc.StallAtPixelScoreboard = true;
7331 pc.CommandStreamerStallEnable = true;
7332 }
7333
7334 pc.DestinationAddressType = DAT_PPGTT;
7335 pc.PostSyncOperation = WriteImmediateData;
7336 pc.Address = (struct anv_address) {
7337 cmd_buffer->device->dynamic_state_pool.block_pool.bo,
7338 event->state.offset
7339 };
7340 pc.ImmediateData = VK_EVENT_SET;
7341 anv_debug_dump_pc(pc);
7342 }
7343 }
7344
7345 void genX(CmdResetEvent2KHR)(
7346 VkCommandBuffer commandBuffer,
7347 VkEvent _event,
7348 VkPipelineStageFlags2KHR stageMask)
7349 {
7350 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7351 ANV_FROM_HANDLE(anv_event, event, _event);
7352
7353 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
7354 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7355
7356 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
7357 if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
7358 pc.StallAtPixelScoreboard = true;
7359 pc.CommandStreamerStallEnable = true;
7360 }
7361
7362 pc.DestinationAddressType = DAT_PPGTT;
7363 pc.PostSyncOperation = WriteImmediateData;
7364 pc.Address = (struct anv_address) {
7365 cmd_buffer->device->dynamic_state_pool.block_pool.bo,
7366 event->state.offset
7367 };
7368 pc.ImmediateData = VK_EVENT_RESET;
7369 anv_debug_dump_pc(pc);
7370 }
7371 }
7372
7373 void genX(CmdWaitEvents2KHR)(
7374 VkCommandBuffer commandBuffer,
7375 uint32_t eventCount,
7376 const VkEvent* pEvents,
7377 const VkDependencyInfoKHR* pDependencyInfos)
7378 {
7379 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7380
7381 #if GFX_VER >= 8
7382 for (uint32_t i = 0; i < eventCount; i++) {
7383 ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
7384
7385 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
7386 sem.WaitMode = PollingMode;
7387 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
7388 sem.SemaphoreDataDword = VK_EVENT_SET;
7389 sem.SemaphoreAddress = (struct anv_address) {
7390 cmd_buffer->device->dynamic_state_pool.block_pool.bo,
7391 event->state.offset
7392 };
7393 }
7394 }
7395 #else
7396 anv_finishme("Implement events on gfx7");
7397 #endif
7398
7399 cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
7400 }
7401
7402 VkResult genX(CmdSetPerformanceOverrideINTEL)(
7403 VkCommandBuffer commandBuffer,
7404 const VkPerformanceOverrideInfoINTEL* pOverrideInfo)
7405 {
7406 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7407
7408 switch (pOverrideInfo->type) {
7409 case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
7410 #if GFX_VER >= 9
7411 anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
7412 csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
7413 csdm2.MediaInstructionDisable = pOverrideInfo->enable;
7414 csdm2._3DRenderingInstructionDisableMask = true;
7415 csdm2.MediaInstructionDisableMask = true;
7416 }
7417 #else
7418 anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
7419 instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
7420 instpm.MediaInstructionDisable = pOverrideInfo->enable;
7421 instpm._3DRenderingInstructionDisableMask = true;
7422 instpm.MediaInstructionDisableMask = true;
7423 }
7424 #endif
7425 break;
7426 }
7427
7428 case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
7429 if (pOverrideInfo->enable) {
7430 /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
7431 anv_add_pending_pipe_bits(cmd_buffer,
7432 ANV_PIPE_FLUSH_BITS |
7433 ANV_PIPE_INVALIDATE_BITS,
7434 "perf counter isolation");
7435 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7436 }
7437 break;
7438
7439 default:
7440 unreachable("Invalid override");
7441 }
7442
7443 return VK_SUCCESS;
7444 }
7445
7446 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
7447 VkCommandBuffer commandBuffer,
7448 const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo)
7449 {
7450 /* TODO: Waiting on the register to write, might depend on generation. */
7451
7452 return VK_SUCCESS;
7453 }
7454
7455 #define TIMESTAMP 0x2358
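/* TIMESTAMP here refers to the command streamer timestamp register (offset
 * 0x2358), read below via an MI store when an immediate (non end-of-pipe)
 * timestamp is requested.
 */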
7456
7457 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
7458 struct anv_device *device,
7459 struct anv_address addr,
7460 bool end_of_pipe) {
7461 if (end_of_pipe) {
7462 anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
7463 pc.PostSyncOperation = WriteTimestamp;
7464 pc.Address = addr;
7465 anv_debug_dump_pc(pc);
7466 }
7467 } else {
7468 struct mi_builder b;
7469 mi_builder_init(&b, &device->info, batch);
7470 mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
7471 }
7472 }
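/* Usage note: end_of_pipe = true requests a PIPE_CONTROL post-sync timestamp,
 * which lands only once prior work has drained the pipeline, whereas
 * end_of_pipe = false samples the command streamer TIMESTAMP register
 * immediately, which suits top-of-pipe style measurements.
 */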
7473