/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_util.h"

#include "ac_debug.h"

#include "util/fast_idiv_by_const.h"

enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
   RADV_PREFETCH_VS = (1 << 1),
   RADV_PREFETCH_TCS = (1 << 2),
   RADV_PREFETCH_TES = (1 << 3),
   RADV_PREFETCH_GS = (1 << 4),
   RADV_PREFETCH_PS = (1 << 5),
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS)
};

enum {
   RADV_RT_STAGE_BITS = (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
                         VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
                         VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family,
                                         uint32_t dst_family, const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);

const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   .color_write_enable = 0xffffffffu,
};

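/* Bind the dynamic state from a pipeline into the command buffer state and
 * flag only the fields that actually changed as dirty, so unchanged state is
 * not re-emitted on the next draw.
 */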
static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   cmd_buffer->state.dirty |= dest_mask;
}

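/* Cache the streamout strides and the enabled buffer mask from the last
 * pre-rasterization shader; this is skipped with NGG streamout, which does
 * not use this state.
 */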
static void
radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radv_shader_info *info;

   if (!pipeline->streamout_shader || cmd_buffer->device->physical_device->use_ngg_streamout)
      return;

   info = &pipeline->streamout_shader->info;
   for (int i = 0; i < MAX_SO_BUFFERS; i++)
      so->stride_in_dw[i] = info->so.strides[i];

   so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
}

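/* On GFX7+, compute queues are executed by the MEC (micro-engine compute)
 * firmware rather than the graphics CP, which changes which packet variants
 * must be used.
 */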
bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
}

enum ring_type
radv_queue_family_to_ring(int f)
{
   switch (f) {
   case RADV_QUEUE_GENERAL:
      return RING_GFX;
   case RADV_QUEUE_COMPUTE:
      return RING_COMPUTE;
   case RADV_QUEUE_TRANSFER:
      return RING_DMA;
   default:
      unreachable("Unknown queue family");
   }
}

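/* Emit a WRITE_DATA packet that makes the selected engine write 'count'
 * dwords from 'data' to the virtual address 'va', with write confirmation
 * enabled so later packets can rely on the data being visible.
 */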
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit_array(cs, data, count);
}

static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}

static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
      vk_object_base_finish(&cmd_buffer->descriptors[i].push_set.set.base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

static VkResult
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
{
   struct radv_cmd_buffer *cmd_buffer;
   unsigned ring;
   cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      /* cmd_buffer->pool is not set yet, so free through the pool directly. */
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->queue_family_index = pool->queue_family_index;

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
   if (!cmd_buffer->cs) {
      radv_destroy_cmd_buffer(cmd_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                       VK_OBJECT_TYPE_DESCRIPTOR_SET);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                          VK_OBJECT_TYPE_DESCRIPTOR_SET);

   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

   list_inithead(&cmd_buffer->upload.list);

   return VK_SUCCESS;
}

static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   cmd_buffer->push_constant_stages = 0;
   cmd_buffer->scratch_size_per_wave_needed = 0;
   cmd_buffer->scratch_waves_wanted = 0;
   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
   cmd_buffer->compute_scratch_waves_wanted = 0;
   cmd_buffer->esgs_ring_size_needed = 0;
   cmd_buffer->gsvs_ring_size_needed = 0;
   cmd_buffer->tess_rings_needed = false;
   cmd_buffer->gds_needed = false;
   cmd_buffer->gds_oa_needed = false;
   cmd_buffer->sample_positions_needed = false;

   if (cmd_buffer->upload.upload_bo)
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
   cmd_buffer->upload.offset = 0;

   cmd_buffer->record_result = VK_SUCCESS;

   memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      cmd_buffer->descriptors[i].dirty = 0;
      cmd_buffer->descriptors[i].valid = 0;
      cmd_buffer->descriptors[i].push_dirty = false;
   }

   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
       cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
      unsigned fence_offset, eop_bug_offset;
      void *fence_ptr;

      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
      memset(fence_ptr, 0, 8);

      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      cmd_buffer->gfx9_fence_va += fence_offset;

      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);

      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
         /* Allocate a buffer for the EOP bug on GFX9. */
         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
         memset(fence_ptr, 0, 16 * num_db);
         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;

         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
      }
   }

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

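/* Grow the upload buffer to hold at least 'min_needed' bytes, doubling the
 * previous size when possible. The old BO is kept on the upload list until
 * reset/destroy time because already-recorded packets may still reference it.
 */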
static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
{
   uint64_t new_size;
   struct radeon_winsys_bo *bo = NULL;
   struct radv_cmd_buffer_upload *upload;
   struct radv_device *device = cmd_buffer->device;

   new_size = MAX2(min_needed, 16 * 1024);
   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

   VkResult result =
      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);

   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return false;
   }

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
   if (cmd_buffer->upload.upload_bo) {
      upload = malloc(sizeof(*upload));

      if (!upload) {
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         device->ws->buffer_destroy(device->ws, bo);
         return false;
      }

      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
      list_add(&upload->list, &cmd_buffer->upload.list);
   }

   cmd_buffer->upload.upload_bo = bo;
   cmd_buffer->upload.size = new_size;
   cmd_buffer->upload.offset = 0;
   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

   if (!cmd_buffer->upload.map) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      return false;
   }

   return true;
}

bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                             unsigned *out_offset, void **ptr)
{
   assert(size % 4 == 0);

   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;

   /* Align to the scalar cache line size if it results in this allocation
    * being placed in fewer of them.
    */
   unsigned offset = cmd_buffer->upload.offset;
   unsigned line_size = rad_info->chip_class >= GFX10 ? 64 : 32;
   unsigned gap = align(offset, line_size) - offset;
   if ((size & (line_size - 1)) > gap)
      offset = align(offset, line_size);

   if (offset + size > cmd_buffer->upload.size) {
      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
         return false;
      offset = 0;
   }

   *out_offset = offset;
   *ptr = cmd_buffer->upload.map + offset;

   cmd_buffer->upload.offset = offset + size;
   return true;
}

bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   uint8_t *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
      return false;

   if (ptr)
      memcpy(ptr, data, size);

   return true;
}

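/* Write an incrementing trace id to the trace BO (dword 0 for primary, dword
 * 1 for secondary command buffers) followed by a NOP-encoded trace point, so
 * hang debugging can tell how far the GPU got in this command buffer.
 */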
void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

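/* Per draw/dispatch epilogue: emits a thread trace marker when SQTT is
 * active, and with RADV_DEBUG_SYNC_SHADERS forces the graphics or compute
 * engines to idle so misbehaving shaders are easier to isolate.
 */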
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.chip_class,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}

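/* Layout of the trace BO used below: dwords 0-1 hold the trace ids, the last
 * bound graphics pipeline is saved at offset 8, the compute pipeline at
 * offset 16, the vertex descriptor pointer at offset 24 and the descriptor
 * sets starting at offset 32.
 */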
static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_device *device = cmd_buffer->device;
   enum ring_type ring;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   switch (ring) {
   case RING_GFX:
      va += 8;
      break;
   case RING_COMPUTE:
      va += 16;
      break;
   default:
      assert(!"invalid ring type");
   }

   uint64_t pipeline_address = (uintptr_t)pipeline;
   data[0] = pipeline_address;
   data[1] = pipeline_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 24;

   data[0] = vb_ptr;
   data[1] = vb_ptr >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

void
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                        struct radv_descriptor_set *set, unsigned idx)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->sets[idx] = set;

   descriptors_state->valid |= (1u << idx); /* active descriptors */
   descriptors_state->dirty |= (1u << idx);
}

static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[MAX_SETS * 2] = {0};
   uint64_t va;
   va = radv_buffer_get_va(device->trace_bo) + 32;

   u_foreach_bit(i, descriptors_state->valid)
   {
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      data[i * 2] = (uint64_t)(uintptr_t)set;
      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
   }

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
}

struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
{
   struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
   return &shader->info.user_sgprs_locs.shader_data[idx];
}

static void
radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                           gl_shader_stage stage, int idx, uint64_t va)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   assert(loc->num_sgprs == 1);

   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
                            false);
}

static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                              struct radv_descriptor_state *descriptors_state,
                              gl_shader_stage stage)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sh_base = pipeline->user_data_0[stage];
   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
   unsigned mask = locs->descriptor_sets_enabled;

   mask &= descriptors_state->dirty & descriptors_state->valid;

   while (mask) {
      int start, count;

      u_bit_scan_consecutive_range(&mask, &start, &count);

      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
      for (int i = 0; i < count; i++) {
         struct radv_descriptor_set *set = descriptors_state->sets[start + i];

         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
      }
   }
}

/**
 * Convert the user sample locations to hardware sample locations (the values
 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
 */
static void
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
                              VkOffset2D *sample_locs)
{
   uint32_t x_offset = x % state->grid_size.width;
   uint32_t y_offset = y % state->grid_size.height;
   uint32_t num_samples = (uint32_t)state->per_pixel;
   VkSampleLocationEXT *user_locs;
   uint32_t pixel_offset;

   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;

   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
   user_locs = &state->locations[pixel_offset];

   for (uint32_t i = 0; i < num_samples; i++) {
      float shifted_pos_x = user_locs[i].x - 0.5;
      float shifted_pos_y = user_locs[i].y - 0.5;

      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);

      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
   }
}

/**
 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
 * locations.
 */
static void
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
                               uint32_t *sample_locs_pixel)
{
   for (uint32_t i = 0; i < num_samples; i++) {
      uint32_t sample_reg_idx = i / 4;
      uint32_t sample_loc_idx = i % 4;
      int32_t pos_x = sample_locs[i].x;
      int32_t pos_y = sample_locs[i].y;

      uint32_t shift_x = 8 * sample_loc_idx;
      uint32_t shift_y = shift_x + 4;

      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
   }
}

/**
 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
 * sample locations.
 */
static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
                               uint32_t num_samples)
{
   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
   uint32_t sample_mask = num_samples - 1;
   uint32_t *distances = alloca(num_samples * sizeof(*distances));
   uint64_t centroid_priority = 0;

   /* Compute the distances from center for each sample. */
   for (int i = 0; i < num_samples; i++) {
      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
   }

   /* Compute the centroid priorities by looking at the distances array. */
   for (int i = 0; i < num_samples; i++) {
      uint32_t min_idx = 0;

      for (int j = 1; j < num_samples; j++) {
         if (distances[j] < distances[min_idx])
            min_idx = j;
      }

      centroid_priorities[i] = min_idx;
      distances[min_idx] = 0xffffffff;
   }

   /* Compute the final centroid priority. */
   for (int i = 0; i < 8; i++) {
      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
   }

   return centroid_priority << 32 | centroid_priority;
}

/**
 * Emit the sample locations that are specified with VK_EXT_sample_locations.
 */
static void
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sample_locs_pixel[4][2] = {0};
   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
   uint32_t max_sample_dist = 0;
   uint64_t centroid_priority;

   if (!cmd_buffer->state.dynamic.sample_location.count)
      return;

   /* Convert the user sample locations to hardware sample locations. */
   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);

   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
   for (uint32_t i = 0; i < 4; i++) {
      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
   }

   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);

   /* Compute the maximum sample distance from the specified locations. */
   for (unsigned i = 0; i < 4; ++i) {
      for (uint32_t j = 0; j < num_samples; j++) {
         VkOffset2D offset = sample_locs[i][j];
         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
      }
   }

   /* Emit the specified user sample locations. */
   switch (num_samples) {
   case 2:
   case 4:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      break;
   case 8:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
                             sample_locs_pixel[0][1]);
      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
                             sample_locs_pixel[1][1]);
      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
                             sample_locs_pixel[2][1]);
      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
                             sample_locs_pixel[3][1]);
      break;
   default:
      unreachable("invalid number of samples");
   }

   /* Emit the maximum sample distance and the centroid priority. */
   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);

   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

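/* Push constants that were promoted to user SGPRs are written directly as SH
 * registers instead of going through the upload buffer.
 */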
static void
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                             gl_shader_stage stage, int idx, uint32_t *values)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + loc->num_sgprs);

   radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
   radeon_emit_array(cmd_buffer->cs, values, loc->num_sgprs);
}

static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   int num_samples = pipeline->graphics.ms.num_samples;
   struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
      cmd_buffer->sample_positions_needed = true;

   if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
      return;

   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
      return;

   if (old_pipeline &&
       old_pipeline->graphics.binning.pa_sc_binner_cntl_0 ==
          pipeline->graphics.binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->graphics.binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader)
{
   uint64_t va;

   if (!shader)
      return;

   va = radv_shader_variant_get_va(shader);

   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

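/* Prefetch shader binaries and the vertex buffer descriptors into L2 using
 * CP DMA. The vertex-stage-only path keeps the first draw from waiting on
 * the remaining stages, which are prefetched later.
 */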
static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                      bool vertex_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   if (vertex_stage_only) {
      /* Fast prefetch path for starting draws as soon as possible. */
      mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS);
   }

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_GEOMETRY]);
      if (radv_pipeline_has_gs_copy_shader(pipeline))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_FRAGMENT]);

   state->prefetch_L2_mask &= ~mask;
}

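/* Program the RB+ (render backend plus) registers (SX_PS_DOWNCONVERT and
 * friends) from the subpass attachment formats and the pipeline's color
 * export formats, enabling down-conversion for 32bpp and smaller formats.
 */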
static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
      return;

   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   unsigned sx_ps_downconvert = 0;
   unsigned sx_blend_opt_epsilon = 0;
   unsigned sx_blend_opt_control = 0;

   if (!cmd_buffer->state.attachments || !subpass)
      return;

   for (unsigned i = 0; i < subpass->color_count; ++i) {
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes,
          * so the SPI color format is set to 32-bit 1-component. */
         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;

      unsigned format = G_028C70_FORMAT(cb->cb_color_info);
      unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
      uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
      uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;

      bool has_alpha, has_rgb;

      /* Set if RGB and A are present. */
      has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);

      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
         has_rgb = !has_alpha;
      else
         has_rgb = true;

      /* Check the colormask and export format. */
      if (!(colormask & 0x7))
         has_rgb = false;
      if (!(colormask & 0x8))
         has_alpha = false;

      if (spi_format == V_028714_SPI_SHADER_ZERO) {
         has_rgb = false;
         has_alpha = false;
      }

      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
       * optimization, even though it has no alpha. */
      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
         has_alpha = true;

      /* Disable value checking for disabled channels. */
      if (!has_rgb)
         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
      if (!has_alpha)
         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

      /* Enable down-conversion for 32bpp and smaller formats. */
      switch (format) {
      case V_028C70_COLOR_8:
      case V_028C70_COLOR_8_8:
      case V_028C70_COLOR_8_8_8_8:
         /* For 1 and 2-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_5_6_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_1_5_5_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_4_4_4_4:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_32:
         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
         break;

      case V_028C70_COLOR_16:
      case V_028C70_COLOR_16_16:
         /* For 1-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
            else
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
         }
         break;

      case V_028C70_COLOR_10_11_11:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
         break;

      case V_028C70_COLOR_2_10_10_10:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
         }
         break;
      case V_028C70_COLOR_5_9_9_9:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
         break;
      }
   }

   /* Do not set the DISABLE bits for the unused attachments, as that
    * breaks dual source blending in SkQP and does not seem to improve
    * performance. */

   if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
       sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
       sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
      return;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
   radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;

   cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
   cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
   cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
}

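/* With primitive binning enabled, a new fragment shader or a new color write
 * mask requires flushing the current batch with a BREAK_BATCH event when
 * multiple context/persistent states are allowed per bin.
 */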
static void
radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->pbb_allowed)
      return;

   struct radv_binning_settings settings =
      radv_get_binning_settings(cmd_buffer->device->physical_device);
   bool break_for_new_ps =
      (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
          cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
      (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
   bool break_for_new_cb_target_mask =
      (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
      settings.context_states_per_bin > 1;

   if (!break_for_new_ps && !break_for_new_cb_target_mask)
      return;

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}

static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;

   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
      return;

   radv_update_multisample_state(cmd_buffer, pipeline);
   radv_update_binning_state(cmd_buffer, pipeline);

   cmd_buffer->scratch_size_per_wave_needed =
      MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
   cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->max_waves);

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
          pipeline->graphics.can_use_guardband)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl !=
          pipeline->graphics.pa_su_sc_mode_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
                                 RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.pa_cl_clip_cntl !=
          pipeline->graphics.pa_cl_clip_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.cb_color_control !=
          pipeline->graphics.cb_color_control)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;

   if (!cmd_buffer->state.emitted_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
                                 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.db_depth_control !=
          pipeline->graphics.db_depth_control)
      cmd_buffer->state.dirty |=
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
          pipeline->graphics.cb_target_mask) {
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
   }

   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

   if (pipeline->graphics.has_ngg_culling &&
       pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
       !cmd_buffer->state.last_nggc_settings) {
      /* The already emitted RSRC2 contains the LDS required for NGG culling.
       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
       * API GS always needs LDS, so this isn't useful there.
       */
      struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
      radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                        (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
                           S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
   }

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
       cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
       memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, pipeline->ctx_cs.buf,
              pipeline->ctx_cs.cdw * 4)) {
      radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_emit_batch_break_on_new_ps(cmd_buffer);

   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
      if (!pipeline->shaders[i])
         continue;

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
   }

   if (radv_pipeline_has_gs_copy_shader(pipeline))
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_pipeline(cmd_buffer, pipeline);

   cmd_buffer->state.emitted_pipeline = pipeline;

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
}

1408 static void
radv_emit_viewport(struct radv_cmd_buffer * cmd_buffer)1409 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1410 {
1411 const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
1412 int i;
1413 const unsigned count = viewport->count;
1414
1415 assert(count);
1416 radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);
1417
1418 for (i = 0; i < count; i++) {
1419 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
1420 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
1421 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
1422 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
1423 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[2]));
1424 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[2]));
1425 }
1426
1427 radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
1428 for (i = 0; i < count; i++) {
1429 float zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1430 float zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1431 radeon_emit(cmd_buffer->cs, fui(zmin));
1432 radeon_emit(cmd_buffer->cs, fui(zmax));
1433 }
1434 }
1435
1436 static void
radv_emit_scissor(struct radv_cmd_buffer * cmd_buffer)1437 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1438 {
1439 uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1440
1441 si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
1442 cmd_buffer->state.dynamic.viewport.viewports,
1443 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1444
1445 cmd_buffer->state.context_roll_without_scissor_emitted = false;
1446 }
1447
1448 static void
radv_emit_discard_rectangle(struct radv_cmd_buffer * cmd_buffer)1449 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1450 {
1451 if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1452 return;
1453
1454 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1455 cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1456 for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1457 VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1458 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1459 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1460 S_028214_BR_Y(rect.offset.y + rect.extent.height));
1461 }
1462 }
1463
1464 static void
radv_emit_line_width(struct radv_cmd_buffer * cmd_buffer)1465 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1466 {
1467 unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1468
1469 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1470 S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1471 }
1472
1473 static void
radv_emit_blend_constants(struct radv_cmd_buffer * cmd_buffer)1474 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1475 {
1476 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1477
1478 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1479 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1480 }
1481
1482 static void
1483 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1484 {
1485 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1486
1487 radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1488 radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1489 S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1490 S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1491 S_028430_STENCILOPVAL(1));
1492 radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1493 S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1494 S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1495 S_028434_STENCILOPVAL_BF(1));
1496 }
1497
1498 static void
1499 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1500 {
1501 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1502
1503 radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
1504 radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
1505 radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
1506 }
1507
1508 static void
1509 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1510 {
1511 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
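/* The poly offset scale registers are in units of 1/16th of a depth slope,
 * hence the pre-multiply by 16.
 */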
1512 unsigned slope = fui(d->depth_bias.slope * 16.0f);
1513
1514 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1515 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1516 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1517 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* FRONT OFFSET */
1518 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1519 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* BACK OFFSET */
1520 }
1521
1522 static void
1523 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1524 {
1525 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1526 uint32_t auto_reset_cntl = 1;
1527
1528 if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1529 auto_reset_cntl = 2;
1530
1531 radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1532 S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1533 S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1534 S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1535 }
1536
1537 static void
1538 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1539 {
1540 unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
1541 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1542
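/* Clear the fields controlled by dynamic state, then fold the current
 * dynamic values back in.
 */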
1543 pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
1544 C_028814_CULL_BACK &
1545 C_028814_FACE &
1546 C_028814_POLY_OFFSET_FRONT_ENABLE &
1547 C_028814_POLY_OFFSET_BACK_ENABLE &
1548 C_028814_POLY_OFFSET_PARA_ENABLE;
1549
1550 pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1551 S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1552 S_028814_FACE(d->front_face) |
1553 S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1554 S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1555 S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1556
1557 radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1558 }
1559
1560 static void
1561 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1562 {
1563 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1564
1565 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
1566 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1567 R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1568 } else {
1569 radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1570 }
1571 }
1572
1573 static void
1574 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1575 {
1576 unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control;
1577 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1578
1579 db_depth_control &= C_028800_Z_ENABLE &
1580 C_028800_Z_WRITE_ENABLE &
1581 C_028800_ZFUNC &
1582 C_028800_DEPTH_BOUNDS_ENABLE &
1583 C_028800_STENCIL_ENABLE &
1584 C_028800_BACKFACE_ENABLE &
1585 C_028800_STENCILFUNC &
1586 C_028800_STENCILFUNC_BF;
1587
1588 db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
1589 S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
1590 S_028800_ZFUNC(d->depth_compare_op) |
1591 S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
1592 S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
1593 S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1594 S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
1595 S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
1596
1597 radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
1598 }
1599
1600 static void
1601 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1602 {
1603 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1604
1605 radeon_set_context_reg(
1606 cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1607 S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1608 S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1609 S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1610 S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1611 S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1612 S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1613 }
1614
1615 static void
1616 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
1617 {
1618 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1619 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1620 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
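/* The per-draw rate fields take the log2 of the fragment size, clamped here
 * to at most 2x2.
 */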
1621 uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
1622 uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
1623 uint32_t pa_cl_vrs_cntl = pipeline->graphics.vrs.pa_cl_vrs_cntl;
1624 uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0];
1625 uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
1626
1627 if (subpass && !subpass->vrs_attachment) {
1628 /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
1629 * can cheat by tweaking the different combiner modes.
1630 */
1631 switch (htile_comb_mode) {
1632 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
1633 /* The result of min(A, 1x1) is always 1x1. */
1634 FALLTHROUGH;
1635 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
1636 /* Force the per-draw VRS rate to 1x1. */
1637 rate_x = rate_y = 0;
1638
1639 /* As the result of min(A, 1x1) or replace(A, 1x1) is always 1x1, set the vertex rate
1640 * combiner mode as passthrough.
1641 */
1642 vertex_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
1643 break;
1644 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
1645 /* The result of max(A, 1x1) is always A. */
1646 FALLTHROUGH;
1647 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
1648 /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
1649 break;
1650 default:
1651 break;
1652 }
1653 }
1654
1655 /* Emit per-draw VRS rate which is the first combiner. */
1656 radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
1657 S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
1658
1659 /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
1660 * draw rate and the vertex rate.
1661 */
1662 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(vertex_comb_mode);
1663
1664 /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
1665 * rate.
1666 */
1667 pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
1668
1669 radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
1670 }
1671
1672 static void
1673 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
1674 {
1675 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1676
1677 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1678 radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
1679 d->primitive_restart_enable);
1680 } else {
1681 radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
1682 d->primitive_restart_enable);
1683 }
1684 }
1685
1686 static void
1687 radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1688 {
1689 unsigned pa_cl_clip_cntl = cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl;
1690 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1691
1692 pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
1693 pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1694
1695 radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1696 }
1697
1698 static void
1699 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1700 {
1701 unsigned cb_color_control = cmd_buffer->state.pipeline->graphics.cb_color_control;
1702 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1703
1704 cb_color_control &= C_028808_ROP3;
1705 cb_color_control |= S_028808_ROP3(d->logic_op);
1706
1707 radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1708 }
1709
1710 static void
1711 radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1712 {
1713 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1714 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1715
1716 radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1717 pipeline->graphics.cb_target_mask & d->color_write_enable);
1718 }
1719
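/**
 * Emit the CB state for a single color attachment, disabling DCC/FMASK
 * compression when the current layout doesn't allow it.
 */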
1720 static void
1721 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
1722 struct radv_color_buffer_info *cb, struct radv_image_view *iview,
1723 VkImageLayout layout, bool in_render_loop, bool disable_dcc)
1724 {
1725 bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1726 uint32_t cb_color_info = cb->cb_color_info;
1727 struct radv_image *image = iview->image;
1728
1729 if (!radv_layout_dcc_compressed(
1730 cmd_buffer->device, image, iview->base_mip, layout, in_render_loop,
1731 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1732 cmd_buffer->queue_family_index)) ||
1733 disable_dcc) {
1734 cb_color_info &= C_028C70_DCC_ENABLE;
1735 }
1736
1737 if (!radv_layout_fmask_compressed(
1738 cmd_buffer->device, image, layout,
1739 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1740 cmd_buffer->queue_family_index))) {
1741 cb_color_info &= C_028C70_COMPRESSION;
1742 }
1743
1744 if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1745 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1746 /* If this bit is set, the FMASK decompression operation
1747 * doesn't occur (a DCC decompress also implies an FMASK decompress).
1748 */
1749 cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1750 }
1751
1752 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1753 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1754 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1755 radeon_emit(cmd_buffer->cs, 0);
1756 radeon_emit(cmd_buffer->cs, 0);
1757 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1758 radeon_emit(cmd_buffer->cs, cb_color_info);
1759 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1760 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1761 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1762 radeon_emit(cmd_buffer->cs, 0);
1763 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1764 radeon_emit(cmd_buffer->cs, 0);
1765
1766 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1767
1768 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1769 cb->cb_color_base >> 32);
1770 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1771 cb->cb_color_cmask >> 32);
1772 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1773 cb->cb_color_fmask >> 32);
1774 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1775 cb->cb_dcc_base >> 32);
1776 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1777 cb->cb_color_attrib2);
1778 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1779 cb->cb_color_attrib3);
1780 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1781 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1782 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1783 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1784 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1785 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1786 radeon_emit(cmd_buffer->cs, cb_color_info);
1787 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1788 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1789 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1790 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1791 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1792 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1793
1794 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1795 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1796 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1797
1798 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1799 cb->cb_mrt_epitch);
1800 } else {
1801 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1802 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1803 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1804 radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1805 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1806 radeon_emit(cmd_buffer->cs, cb_color_info);
1807 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1808 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1809 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1810 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1811 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1812 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1813
1814 if (is_vi) { /* DCC BASE */
1815 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
1816 cb->cb_dcc_base);
1817 }
1818 }
1819
1820 if (G_028C70_DCC_ENABLE(cb_color_info)) {
1821 /* Drawing with DCC enabled also compresses colorbuffers. */
1822 VkImageSubresourceRange range = {
1823 .aspectMask = iview->aspect_mask,
1824 .baseMipLevel = iview->base_mip,
1825 .levelCount = iview->level_count,
1826 .baseArrayLayer = iview->base_layer,
1827 .layerCount = iview->layer_count,
1828 };
1829
1830 radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1831 }
1832 }
1833
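/**
 * Re-emit DB_Z_INFO with ZRANGE_PRECISION adjusted for the TC-compat zrange
 * bug. When the last fast clear value isn't known at record time, the write
 * is guarded by a COND_EXEC on the image's TC-compat zrange metadata.
 */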
1834 static void
1835 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1836 const struct radv_image_view *iview, VkImageLayout layout,
1837 bool in_render_loop, bool requires_cond_exec)
1838 {
1839 const struct radv_image *image = iview->image;
1840 uint32_t db_z_info = ds->db_z_info;
1841 uint32_t db_z_info_reg;
1842
1843 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1844 !radv_image_is_tc_compat_htile(image))
1845 return;
1846
1847 if (!radv_layout_is_htile_compressed(
1848 cmd_buffer->device, image, layout, in_render_loop,
1849 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1850 cmd_buffer->queue_family_index))) {
1851 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1852 }
1853
1854 db_z_info &= C_028040_ZRANGE_PRECISION;
1855
1856 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1857 db_z_info_reg = R_028038_DB_Z_INFO;
1858 } else {
1859 db_z_info_reg = R_028040_DB_Z_INFO;
1860 }
1861
1862 /* When we don't know the last fast clear value we need to emit a
1863 * conditional packet that will eventually skip the following
1864 * SET_CONTEXT_REG packet.
1865 */
1866 if (requires_cond_exec) {
1867 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
1868
1869 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1870 radeon_emit(cmd_buffer->cs, va);
1871 radeon_emit(cmd_buffer->cs, va >> 32);
1872 radeon_emit(cmd_buffer->cs, 0);
1873 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1874 }
1875
1876 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1877 }
1878
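/**
 * Emit the DB state for the bound depth/stencil attachment, disabling HTILE
 * when the current layout doesn't allow compressed access.
 */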
1879 static void
1880 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1881 struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
1882 {
1883 const struct radv_image *image = iview->image;
1884 uint32_t db_z_info = ds->db_z_info;
1885 uint32_t db_stencil_info = ds->db_stencil_info;
1886
1887 if (!radv_layout_is_htile_compressed(
1888 cmd_buffer->device, image, layout, in_render_loop,
1889 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1890 cmd_buffer->queue_family_index))) {
1891 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1892 db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1893 }
1894
1895 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1896 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1897
1898 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1899 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1900 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
1901
1902 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
1903 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
1904 radeon_emit(cmd_buffer->cs, db_z_info);
1905 radeon_emit(cmd_buffer->cs, db_stencil_info);
1906 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1907 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1908 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1909 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1910
1911 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
1912 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1913 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1914 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1915 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1916 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
1917 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1918 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1919 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1920 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1921 radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1922
1923 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1924 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
1925 radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */
1926 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
1927 radeon_emit(cmd_buffer->cs,
1928 S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
1929 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */
1930 radeon_emit(cmd_buffer->cs,
1931 S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1932 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */
1933 radeon_emit(cmd_buffer->cs,
1934 S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
1935 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */
1936 radeon_emit(cmd_buffer->cs,
1937 S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1938
1939 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1940 radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1941 radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1942 } else {
1943 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1944
1945 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1946 radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */
1947 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
1948 radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */
1949 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */
1950 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */
1951 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */
1952 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
1953 radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */
1954 radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
1955 }
1956
1957 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1958 radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);
1959
1960 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1961 ds->pa_su_poly_offset_db_fmt_cntl);
1962 }
1963
1964 /**
1965 * Update the fast clear depth/stencil values if the image is bound as a
1966 * depth/stencil buffer.
1967 */
1968 static void
1969 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1970 const struct radv_image_view *iview,
1971 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
1972 {
1973 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1974 const struct radv_image *image = iview->image;
1975 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1976 uint32_t att_idx;
1977
1978 if (!cmd_buffer->state.attachments || !subpass)
1979 return;
1980
1981 if (!subpass->depth_stencil_attachment)
1982 return;
1983
1984 att_idx = subpass->depth_stencil_attachment->attachment;
1985 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1986 return;
1987
1988 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
1989 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1990 radeon_emit(cs, ds_clear_value.stencil);
1991 radeon_emit(cs, fui(ds_clear_value.depth));
1992 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1993 radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
1994 } else {
1995 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1996 radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
1997 }
1998
1999 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
2000 * only needed when clearing Z to 0.0.
2001 */
2002 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
2003 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2004 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2005
2006 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
2007 layout, in_render_loop, false);
2008 }
2009
2010 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2011 }
2012
2013 /**
2014 * Set the clear depth/stencil values to the image's metadata.
2015 */
2016 static void
2017 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2018 const VkImageSubresourceRange *range,
2019 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2020 {
2021 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2022 uint32_t level_count = radv_get_levelCount(image, range);
2023
2024 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2025 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
2026
2027 /* Use the fastest way when both aspects are used. */
2028 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
2029 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2030 radeon_emit(cs, va);
2031 radeon_emit(cs, va >> 32);
2032
2033 for (uint32_t l = 0; l < level_count; l++) {
2034 radeon_emit(cs, ds_clear_value.stencil);
2035 radeon_emit(cs, fui(ds_clear_value.depth));
2036 }
2037 } else {
2038 /* Otherwise we need one WRITE_DATA packet per level. */
2039 for (uint32_t l = 0; l < level_count; l++) {
2040 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
2041 unsigned value;
2042
2043 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2044 value = fui(ds_clear_value.depth);
2045 va += 4;
2046 } else {
2047 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2048 value = ds_clear_value.stencil;
2049 }
2050
2051 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
2052 radeon_emit(cs,
2053 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2054 radeon_emit(cs, va);
2055 radeon_emit(cs, va >> 32);
2056 radeon_emit(cs, value);
2057 }
2058 }
2059 }
2060
2061 /**
2062 * Update the TC-compat metadata value for this image.
2063 */
2064 static void
2065 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2066 const VkImageSubresourceRange *range, uint32_t value)
2067 {
2068 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2069
2070 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2071 return;
2072
2073 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2074 uint32_t level_count = radv_get_levelCount(image, range);
2075
2076 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2077 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2078 radeon_emit(cs, va);
2079 radeon_emit(cs, va >> 32);
2080
2081 for (uint32_t l = 0; l < level_count; l++)
2082 radeon_emit(cs, value);
2083 }
2084
2085 static void
2086 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2087 const struct radv_image_view *iview,
2088 VkClearDepthStencilValue ds_clear_value)
2089 {
2090 VkImageSubresourceRange range = {
2091 .aspectMask = iview->aspect_mask,
2092 .baseMipLevel = iview->base_mip,
2093 .levelCount = iview->level_count,
2094 .baseArrayLayer = iview->base_layer,
2095 .layerCount = iview->layer_count,
2096 };
2097 uint32_t cond_val;
2098
2099 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2100 * depth clear value is 0.0f.
2101 */
2102 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2103
2104 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2105 }
2106
2107 /**
2108 * Update the clear depth/stencil values for this image.
2109 */
2110 void
2111 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2112 const struct radv_image_view *iview,
2113 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2114 {
2115 VkImageSubresourceRange range = {
2116 .aspectMask = iview->aspect_mask,
2117 .baseMipLevel = iview->base_mip,
2118 .levelCount = iview->level_count,
2119 .baseArrayLayer = iview->base_layer,
2120 .layerCount = iview->layer_count,
2121 };
2122 struct radv_image *image = iview->image;
2123
2124 assert(radv_htile_enabled(image, range.baseMipLevel));
2125
2126 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2127
2128 if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2129 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2130 }
2131
2132 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2133 }
2134
2135 /**
2136 * Load the clear depth/stencil values from the image's metadata.
2137 */
2138 static void
2139 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2140 {
2141 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2142 const struct radv_image *image = iview->image;
2143 VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
2144 uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
2145 unsigned reg_offset = 0, reg_count = 0;
2146
2147 assert(radv_image_has_htile(image));
2148
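/* DB_DEPTH_CLEAR follows DB_STENCIL_CLEAR, and the clear metadata uses the
 * same layout (stencil dword, then depth dword), so depth-only images just
 * skip the first register/dword.
 */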
2149 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2150 ++reg_count;
2151 } else {
2152 ++reg_offset;
2153 va += 4;
2154 }
2155 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2156 ++reg_count;
2157
2158 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2159
2160 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2161 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2162 radeon_emit(cs, va);
2163 radeon_emit(cs, va >> 32);
2164 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2165 radeon_emit(cs, reg_count);
2166 } else {
2167 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2168 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2169 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2170 radeon_emit(cs, va);
2171 radeon_emit(cs, va >> 32);
2172 radeon_emit(cs, reg >> 2);
2173 radeon_emit(cs, 0);
2174
2175 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2176 radeon_emit(cs, 0);
2177 }
2178 }
2179
2180 /*
2181 * With DCC some colors don't require CMASK elimination before being
2182 * used as a texture. This sets a predicate value to determine whether the
2183 * CMASK eliminate is required.
2184 */
2185 void
2186 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2187 const VkImageSubresourceRange *range, bool value)
2188 {
2189 if (!image->fce_pred_offset)
2190 return;
2191
2192 uint64_t pred_val = value;
2193 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2194 uint32_t level_count = radv_get_levelCount(image, range);
2195 uint32_t count = 2 * level_count;
2196
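/* Write the 64-bit predicate value for each mip level (two dwords each). */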
2197 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2198 radeon_emit(cmd_buffer->cs,
2199 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2200 radeon_emit(cmd_buffer->cs, va);
2201 radeon_emit(cmd_buffer->cs, va >> 32);
2202
2203 for (uint32_t l = 0; l < level_count; l++) {
2204 radeon_emit(cmd_buffer->cs, pred_val);
2205 radeon_emit(cmd_buffer->cs, pred_val >> 32);
2206 }
2207 }
2208
2209 /**
2210 * Update the DCC predicate to reflect the compression state.
2211 */
2212 void
2213 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2214 const VkImageSubresourceRange *range, bool value)
2215 {
2216 if (image->dcc_pred_offset == 0)
2217 return;
2218
2219 uint64_t pred_val = value;
2220 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2221 uint32_t level_count = radv_get_levelCount(image, range);
2222 uint32_t count = 2 * level_count;
2223
2224 assert(radv_dcc_enabled(image, range->baseMipLevel));
2225
2226 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2227 radeon_emit(cmd_buffer->cs,
2228 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2229 radeon_emit(cmd_buffer->cs, va);
2230 radeon_emit(cmd_buffer->cs, va >> 32);
2231
2232 for (uint32_t l = 0; l < level_count; l++) {
2233 radeon_emit(cmd_buffer->cs, pred_val);
2234 radeon_emit(cmd_buffer->cs, pred_val >> 32);
2235 }
2236 }
2237
2238 /**
2239 * Update the fast clear color values if the image is bound as a color buffer.
2240 */
2241 static void
2242 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2243 int cb_idx, uint32_t color_values[2])
2244 {
2245 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2246 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2247 uint32_t att_idx;
2248
2249 if (!cmd_buffer->state.attachments || !subpass)
2250 return;
2251
2252 att_idx = subpass->color_attachments[cb_idx].attachment;
2253 if (att_idx == VK_ATTACHMENT_UNUSED)
2254 return;
2255
2256 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2257 return;
2258
2259 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2260 radeon_emit(cs, color_values[0]);
2261 radeon_emit(cs, color_values[1]);
2262
2263 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2264 }
2265
2266 /**
2267 * Set the clear color values to the image's metadata.
2268 */
2269 static void
2270 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2271 const VkImageSubresourceRange *range, uint32_t color_values[2])
2272 {
2273 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2274 uint32_t level_count = radv_get_levelCount(image, range);
2275 uint32_t count = 2 * level_count;
2276
2277 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
2278
2279 if (radv_image_has_clear_value(image)) {
2280 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2281
2282 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2283 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2284 radeon_emit(cs, va);
2285 radeon_emit(cs, va >> 32);
2286
2287 for (uint32_t l = 0; l < level_count; l++) {
2288 radeon_emit(cs, color_values[0]);
2289 radeon_emit(cs, color_values[1]);
2290 }
2291 } else {
2292 /* Without clear value metadata, only the default (zero) values can be set. */
2293 assert(color_values[0] == 0 && color_values[1] == 0);
2294 }
2295 }
2296
2297 /**
2298 * Update the clear color values for this image.
2299 */
2300 void
2301 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2302 const struct radv_image_view *iview, int cb_idx,
2303 uint32_t color_values[2])
2304 {
2305 struct radv_image *image = iview->image;
2306 VkImageSubresourceRange range = {
2307 .aspectMask = iview->aspect_mask,
2308 .baseMipLevel = iview->base_mip,
2309 .levelCount = iview->level_count,
2310 .baseArrayLayer = iview->base_layer,
2311 .layerCount = iview->layer_count,
2312 };
2313
2314 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip));
2315
2316 /* There is no need to update the clear value for images that are fast cleared with the comp-to-single
2317 * mode because the hardware gets the value from the image directly.
2318 */
2319 if (iview->image->support_comp_to_single)
2320 return;
2321
2322 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2323
2324 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2325 }
2326
2327 /**
2328 * Load the clear color values from the image's metadata.
2329 */
2330 static void
2331 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
2332 int cb_idx)
2333 {
2334 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2335 struct radv_image *image = iview->image;
2336
2337 if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip))
2338 return;
2339
2340 if (iview->image->support_comp_to_single)
2341 return;
2342
2343 if (!radv_image_has_clear_value(image)) {
2344 uint32_t color_values[2] = {0, 0};
2345 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2346 return;
2347 }
2348
2349 uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
2350 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2351
2352 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2353 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2354 radeon_emit(cs, va);
2355 radeon_emit(cs, va >> 32);
2356 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2357 radeon_emit(cs, 2);
2358 } else {
2359 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2360 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2361 COPY_DATA_COUNT_SEL);
2362 radeon_emit(cs, va);
2363 radeon_emit(cs, va >> 32);
2364 radeon_emit(cs, reg >> 2);
2365 radeon_emit(cs, 0);
2366
2367 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2368 radeon_emit(cs, 0);
2369 }
2370 }
2371
2372 /* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
2373 * broken if the CB caches data of multiple mips of the same image at the
2374 * same time.
2375 *
2376 * Insert some flushes to avoid this.
2377 */
2378 static void
2379 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
2380 {
2381 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2382 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2383 bool color_mip_changed = false;
2384
2385 /* Entire workaround is not applicable before GFX9 */
2386 if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2387 return;
2388
2389 if (!framebuffer)
2390 return;
2391
2392 for (int i = 0; i < subpass->color_count; ++i) {
2393 int idx = subpass->color_attachments[i].attachment;
2394 if (idx == VK_ATTACHMENT_UNUSED)
2395 continue;
2396
2397 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2398
2399 if ((radv_image_has_CB_metadata(iview->image) ||
2400 radv_dcc_enabled(iview->image, iview->base_mip) ||
2401 radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
2402 cmd_buffer->state.cb_mip[i] != iview->base_mip)
2403 color_mip_changed = true;
2404
2405 cmd_buffer->state.cb_mip[i] = iview->base_mip;
2406 }
2407
2408 if (color_mip_changed) {
2409 cmd_buffer->state.flush_bits |=
2410 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2411 }
2412 }
2413
2414 /* This function does the flushes for mip changes if the levels are not zero for
2415 * all render targets. This way we can assume at the start of the next cmd_buffer
2416 * that rendering to mip 0 doesn't need any flushes. Since mip 0 is the most
2417 * common case, this saves some flushes. */
2418 static void
2419 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2420 {
2421 /* Entire workaround is not applicable before GFX9 */
2422 if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2423 return;
2424
2425 bool need_color_mip_flush = false;
2426 for (unsigned i = 0; i < 8; ++i) {
2427 if (cmd_buffer->state.cb_mip[i]) {
2428 need_color_mip_flush = true;
2429 break;
2430 }
2431 }
2432
2433 if (need_color_mip_flush) {
2434 cmd_buffer->state.flush_bits |=
2435 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2436 }
2437
2438 memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2439 }
2440
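/**
 * Return the device's global VRS image, initializing the VRS state on first
 * use.
 */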
2441 static struct radv_image *
2442 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2443 {
2444 struct radv_device *device = cmd_buffer->device;
2445
2446 if (!device->vrs.image) {
2447 VkResult result;
2448
2449 /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2450 result = radv_device_init_vrs_state(device);
2451 if (result != VK_SUCCESS) {
2452 cmd_buffer->record_result = result;
2453 return NULL;
2454 }
2455 }
2456
2457 return device->vrs.image;
2458 }
2459
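/**
 * Emit the framebuffer state for the current subpass: all color attachments,
 * the depth/stencil attachment (or the internal VRS depth buffer, or an
 * invalid DB), plus the window scissor and DCC control.
 */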
2460 static void
2461 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
2462 {
2463 int i;
2464 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2465 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2466
2467 /* This may happen when recording inherited secondary command buffers. */
2468 if (!framebuffer)
2469 return;
2470
2471 for (i = 0; i < 8; ++i) {
2472 if (i >= subpass->color_count ||
2473 subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
2474 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
2475 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
2476 continue;
2477 }
2478
2479 int idx = subpass->color_attachments[i].attachment;
2480 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2481 VkImageLayout layout = subpass->color_attachments[i].layout;
2482 bool in_render_loop = subpass->color_attachments[i].in_render_loop;
2483
2484 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bo);
2485
2486 assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2487 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2488 radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
2489 in_render_loop, cmd_buffer->state.attachments[idx].disable_dcc);
2490
2491 radv_load_color_clear_metadata(cmd_buffer, iview, i);
2492 }
2493
2494 if (subpass->depth_stencil_attachment) {
2495 int idx = subpass->depth_stencil_attachment->attachment;
2496 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2497 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2498 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2499 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2500 cmd_buffer->state.attachments[idx].iview->image->bo);
2501
2502 radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
2503 in_render_loop);
2504
2505 if (radv_layout_is_htile_compressed(
2506 cmd_buffer->device, iview->image, layout, in_render_loop,
2507 radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index,
2508 cmd_buffer->queue_family_index))) {
2509 /* Only load the depth/stencil fast clear values when
2510 * compressed rendering is enabled.
2511 */
2512 radv_load_ds_clear_metadata(cmd_buffer, iview);
2513 }
2514 } else if (subpass->vrs_attachment && cmd_buffer->device->vrs.image) {
2515 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
2516 * bind our internal depth buffer that contains the VRS data as part of HTILE.
2517 */
2518 VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
2519 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
2520 struct radv_image *image = cmd_buffer->device->vrs.image;
2521 struct radv_ds_buffer_info ds;
2522 struct radv_image_view iview;
2523
2524 radv_image_view_init(&iview, cmd_buffer->device,
2525 &(VkImageViewCreateInfo){
2526 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2527 .image = radv_image_to_handle(image),
2528 .viewType = radv_meta_get_view_type(image),
2529 .format = image->vk_format,
2530 .subresourceRange =
2531 {
2532 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
2533 .baseMipLevel = 0,
2534 .levelCount = 1,
2535 .baseArrayLayer = 0,
2536 .layerCount = 1,
2537 },
2538 },
2539 NULL);
2540
2541 radv_initialise_vrs_surface(image, htile_buffer, &ds);
2542
2543 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);
2544
2545 radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);
2546
2547 radv_image_view_finish(&iview);
2548 } else {
2549 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
2550 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2551 else
2552 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2553
2554 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
2555 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2556 }
2557 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2558 S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));
2559
2560 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
2561 bool disable_constant_encode =
2562 cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2563 enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
2564 uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
2565
2566 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2567 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
2568 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2569 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2570 }
2571
2572 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2573 }
2574
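/**
 * Emit the index type and, for indirect draws, the index base/size packets.
 */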
2575 static void
2576 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2577 {
2578 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2579 struct radv_cmd_state *state = &cmd_buffer->state;
2580
2581 if (state->index_type != state->last_index_type) {
2582 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2583 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
2584 R_03090C_VGT_INDEX_TYPE, 2, state->index_type);
2585 } else {
2586 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
2587 radeon_emit(cs, state->index_type);
2588 }
2589
2590 state->last_index_type = state->index_type;
2591 }
2592
2593 /* For the direct indexed draws we use DRAW_INDEX_2, which includes
2594 * the index_va and max_index_count already. */
2595 if (!indirect)
2596 return;
2597
2598 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2599 radeon_emit(cs, state->index_va);
2600 radeon_emit(cs, state->index_va >> 32);
2601
2602 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2603 radeon_emit(cs, state->max_index_count);
2604
2605 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2606 }
2607
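/**
 * Program DB_COUNT_CONTROL depending on whether occlusion queries are active
 * and whether they need perfect (precise) Z-pass counts.
 */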
2608 void
2609 radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
2610 {
2611 bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2612 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2613 uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
2614 uint32_t db_count_control;
2615
2616 if (!cmd_buffer->state.active_occlusion_queries) {
2617 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2618 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2619 pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2620 /* Re-enable out-of-order rasterization if the
2621 * bound pipeline supports it and if it has
2622 * been disabled before starting any perfect
2623 * occlusion queries.
2624 */
2625 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2626 }
2627 }
2628 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2629 } else {
2630 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2631 uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2632 bool gfx10_perfect =
2633 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
2634
2635 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2636 /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
2637 * covered tiles, discards, and early depth testing. For more details,
2638 * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
2639 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2640 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2641 S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
2642 S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
2643
2644 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2645 pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2646 /* If the bound pipeline has enabled
2647 * out-of-order rasterization, we should
2648 * disable it before starting any perfect
2649 * occlusion queries.
2650 */
2651 pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2652
2653 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2654 }
2655 } else {
2656 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
2657 }
2658 }
2659
2660 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2661
2662 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2663 }
2664
2665 unsigned
2666 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
2667 {
2668 /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, or a
2669 * single array sorted in ascending order using:
2670 * - total number of attributes
2671 * - number of instanced attributes
2672 * - index of first instanced attribute
2673 */
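/* Worked example (using the LUTs below): num_attributes = 3 and
 * instance_rate_inputs = 0x4 give start_index = 4,
 * offset_from_start_index = 0 and first = 2, i.e. prolog index 6.
 */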
2674
2675 /* From total number of attributes to offset. */
2676 static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84,
2677 120, 165, 220, 286, 364, 455, 560, 680};
2678 unsigned start_index = total_to_offset[num_attributes - 1];
2679
2680 /* From number of instanced attributes to offset. This would require a different LUT depending on
2681 * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
2682 * attributes.
2683 */
2684 static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91,
2685 100, 108, 115, 121, 126, 130, 133, 135};
2686 unsigned count = util_bitcount(instance_rate_inputs);
2687 unsigned offset_from_start_index =
2688 count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
2689
2690 unsigned first = ffs(instance_rate_inputs) - 1;
2691 return start_index + offset_from_start_index + first;
2692 }
2693
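/* First dword of a VS prolog key. key_size stores the total key size in
 * bytes so that hashing and comparison can handle the variable-length key
 * words that follow.
 */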
2694 union vs_prolog_key_header {
2695 struct {
2696 uint32_t key_size : 8;
2697 uint32_t num_attributes : 6;
2698 uint32_t as_ls : 1;
2699 uint32_t is_ngg : 1;
2700 uint32_t wave32 : 1;
2701 uint32_t next_stage : 3;
2702 uint32_t instance_rate_inputs : 1;
2703 uint32_t alpha_adjust_lo : 1;
2704 uint32_t alpha_adjust_hi : 1;
2705 uint32_t misaligned_mask : 1;
2706 uint32_t post_shuffle : 1;
2707 uint32_t nontrivial_divisors : 1;
2708 /* We need this to ensure the padding is zero. It's useful even if it's unused. */
2709 uint32_t padding0 : 6;
2710 };
2711 uint32_t v;
2712 };
2713
2714 uint32_t
2715 radv_hash_vs_prolog(const void *key_)
2716 {
2717 const uint32_t *key = key_;
2718 union vs_prolog_key_header header;
2719 header.v = key[0];
2720 return _mesa_hash_data(key, header.key_size);
2721 }
2722
2723 bool
2724 radv_cmp_vs_prolog(const void *a_, const void *b_)
2725 {
2726 const uint32_t *a = a_;
2727 const uint32_t *b = b_;
2728 if (a[0] != b[0])
2729 return false;
2730
2731 union vs_prolog_key_header header;
2732 header.v = a[0];
2733 return memcmp(a, b, header.key_size) == 0;
2734 }
2735
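/**
 * Find a VS prolog for the current dynamic vertex input state: prefer a
 * pre-compiled prolog, then the per-device prolog cache, and finally compile
 * a new one under the cache's write lock.
 */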
2736 static struct radv_shader_prolog *
2737 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
2738 uint32_t *nontrivial_divisors)
2739 {
2740 STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
2741 assert(vs_shader->info.vs.dynamic_inputs);
2742
2743 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
2744 const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2745 struct radv_device *device = cmd_buffer->device;
2746
2747 unsigned num_attributes = pipeline->last_vertex_attrib_bit;
2748 uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
2749
2750 uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
2751 *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
2752 enum chip_class chip = device->physical_device->rad_info.chip_class;
2753 const uint32_t misaligned_mask = chip == GFX6 || chip >= GFX10 ? cmd_buffer->state.vbo_misaligned_mask : 0;
2754
2755 /* try to use a pre-compiled prolog first */
2756 struct radv_shader_prolog *prolog = NULL;
2757 if (pipeline->can_use_simple_input &&
2758 (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
2759 !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
2760 if (!instance_rate_inputs) {
2761 prolog = device->simple_vs_prologs[num_attributes - 1];
2762 } else if (num_attributes <= 16 && !*nontrivial_divisors &&
2763 util_bitcount(instance_rate_inputs) ==
2764 (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
2765 unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
2766 prolog = device->instance_rate_vs_prologs[index];
2767 }
2768 }
2769 if (prolog)
2770 return prolog;
2771
2772 /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
2773 uint32_t key_words[16];
2774 unsigned key_size = 1;
2775
2776 struct radv_vs_prolog_key key;
2777 key.state = state;
2778 key.num_attributes = num_attributes;
2779 key.misaligned_mask = misaligned_mask;
2780 /* The instance ID input VGPR is placed differently when as_ls=true. */
2781 key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
2782 key.is_ngg = vs_shader->info.is_ngg;
2783 key.wave32 = vs_shader->info.wave_size == 32;
2784 key.next_stage = pipeline->next_vertex_stage;
2785
2786 union vs_prolog_key_header header;
2787 header.v = 0;
2788 header.num_attributes = num_attributes;
2789 header.as_ls = key.as_ls;
2790 header.is_ngg = key.is_ngg;
2791 header.wave32 = key.wave32;
2792 header.next_stage = key.next_stage;
2793
2794 if (instance_rate_inputs & ~*nontrivial_divisors) {
2795 header.instance_rate_inputs = true;
2796 key_words[key_size++] = instance_rate_inputs;
2797 }
2798 if (*nontrivial_divisors) {
2799 header.nontrivial_divisors = true;
2800 key_words[key_size++] = *nontrivial_divisors;
2801 }
2802 if (misaligned_mask) {
2803 header.misaligned_mask = true;
2804 key_words[key_size++] = misaligned_mask;
2805
2806 uint8_t *formats = (uint8_t *)&key_words[key_size];
2807 unsigned num_formats = 0;
2808 u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
2809 while (num_formats & 0x3)
2810 formats[num_formats++] = 0;
2811 key_size += num_formats / 4u;
2812
2813 if (state->post_shuffle & attribute_mask) {
2814 header.post_shuffle = true;
2815 key_words[key_size++] = state->post_shuffle & attribute_mask;
2816 }
2817 }
2818 if (state->alpha_adjust_lo & attribute_mask) {
2819 header.alpha_adjust_lo = true;
2820 key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
2821 }
2822 if (state->alpha_adjust_hi & attribute_mask) {
2823 header.alpha_adjust_hi = true;
2824 key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
2825 }
2826
2827 header.key_size = key_size * sizeof(key_words[0]);
2828 key_words[0] = header.v;
2829
2830 uint32_t hash = radv_hash_vs_prolog(key_words);
2831
2832 if (cmd_buffer->state.emitted_vs_prolog &&
2833 cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
2834 radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
2835 return cmd_buffer->state.emitted_vs_prolog;
2836
2837 u_rwlock_rdlock(&device->vs_prologs_lock);
2838 struct hash_entry *prolog_entry =
2839 _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
2840 u_rwlock_rdunlock(&device->vs_prologs_lock);
2841
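/* Not found under the read lock: retake the lock as a writer and search
 * again, since another thread may have inserted the same prolog in the
 * meantime (classic double-checked locking).
 */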
2842 if (!prolog_entry) {
2843 u_rwlock_wrlock(&device->vs_prologs_lock);
2844 prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
2845 if (prolog_entry) {
2846 u_rwlock_wrunlock(&device->vs_prologs_lock);
2847 return prolog_entry->data;
2848 }
2849
2850 prolog = radv_create_vs_prolog(device, &key);
2851 uint32_t *key2 = malloc(key_size * 4);
2852 if (!prolog || !key2) {
2853 radv_prolog_destroy(device, prolog);
2854 free(key2);
2855 u_rwlock_wrunlock(&device->vs_prologs_lock);
2856 return NULL;
2857 }
2858 memcpy(key2, key_words, key_size * 4);
2859 _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
2860
2861 u_rwlock_wrunlock(&device->vs_prologs_lock);
2862 return prolog;
2863 }
2864
2865 return prolog_entry->data;
2866 }
2867
2868 static void
2869 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
2870 struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
2871 {
2872 /* no need to re-emit anything in this case */
2873 if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
2874 return;
2875
2876 enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
2877 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2878 uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
2879
2880 assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
2881 assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);
2882
2883 uint32_t rsrc1 = vs_shader->config.rsrc1;
2884 if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
2885 rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
2886
2887 /* The main shader must not use fewer VGPRs than the prolog, otherwise shared VGPRs might
2888 * not work.
2889 */
2890 assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
2891
2892 unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
2893 unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
2894 if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
2895 pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
2896 rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
2897 } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
2898 pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
2899 rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
2900 } else if (vs_shader->info.vs.as_ls) {
2901 pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
2902 rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
2903 } else if (vs_shader->info.vs.as_es) {
2904 pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
2905 rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
2906 }
2907
2908 radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2);
2909 radeon_emit(cmd_buffer->cs, prolog_va >> 8);
2910 radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40));
2911
2912 if (chip < GFX10)
2913 radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
2914 else
2915 assert(rsrc1 == vs_shader->config.rsrc1);
2916
2917 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
2918 }
2919
2920 static void
2921 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
2922 uint32_t nontrivial_divisors, bool pipeline_is_dirty)
2923 {
2924 /* no need to re-emit anything in this case */
2925 if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
2926 !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
2927 return;
2928
2929 struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
2930 uint64_t input_va = radv_shader_variant_get_va(vs_shader);
2931
2932 if (nontrivial_divisors) {
2933 unsigned inputs_offset;
2934 uint32_t *inputs;
2935 unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
2936 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
2937 return;
2938
2939 *(inputs++) = input_va;
2940 *(inputs++) = input_va >> 32;
2941
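/* Each nontrivial divisor is described by two dwords:
 *   dword0 = pre_shift | (increment << 8) | (post_shift << 16)
 *   dword1 = multiplier
 * (see util_compute_fast_udiv_info()). The special cases below use the same
 * encoding; e.g. div == 8 packs to { util_logbase2(8) | (1 << 8), 0xffffffffu }
 * = { 0x103, 0xffffffffu }.
 */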
2942 u_foreach_bit(index, nontrivial_divisors)
2943 {
2944 uint32_t div = state->divisors[index];
2945 if (div == 0) {
2946 *(inputs++) = 0;
2947 *(inputs++) = 1;
2948 } else if (util_is_power_of_two_or_zero(div)) {
2949 *(inputs++) = util_logbase2(div) | (1 << 8);
2950 *(inputs++) = 0xffffffffu;
2951 } else {
2952 struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
2953 *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
2954 *(inputs++) = info.multiplier;
2955 }
2956 }
2957
2958 input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
2959 }
2960
2961 struct radv_userdata_info *loc =
2962 &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
2963 uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX];
2964 assert(loc->sgpr_idx != -1);
2965 assert(loc->num_sgprs == 2);
2966 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
2967 input_va, true);
2968 }
2969
2970 static void
2971 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2972 {
2973 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2974 struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
2975
2976 if (!vs_shader->info.vs.has_prolog)
2977 return;
2978
2979 uint32_t nontrivial_divisors;
2980 struct radv_shader_prolog *prolog =
2981 lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
2982 if (!prolog) {
2983 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
2984 return;
2985 }
2986 emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
2987 emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
2988
2989 cmd_buffer->state.emitted_vs_prolog = prolog;
2990 }
2991
2992 static void
2993 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2994 {
2995 uint64_t states =
2996 cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
2997
2998 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2999 radv_emit_viewport(cmd_buffer);
3000
3001 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
3002 !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
3003 radv_emit_scissor(cmd_buffer);
3004
3005 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
3006 radv_emit_line_width(cmd_buffer);
3007
3008 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
3009 radv_emit_blend_constants(cmd_buffer);
3010
3011 if (states &
3012 (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
3013 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
3014 radv_emit_stencil(cmd_buffer);
3015
3016 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
3017 radv_emit_depth_bounds(cmd_buffer);
3018
3019 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
3020 radv_emit_depth_bias(cmd_buffer);
3021
3022 if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
3023 radv_emit_discard_rectangle(cmd_buffer);
3024
3025 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
3026 radv_emit_sample_locations(cmd_buffer);
3027
3028 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
3029 radv_emit_line_stipple(cmd_buffer);
3030
3031 if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3032 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
3033 radv_emit_culling(cmd_buffer, states);
3034
3035 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
3036 radv_emit_primitive_topology(cmd_buffer);
3037
3038 if (states &
3039 (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
3040 RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
3041 RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
3042 radv_emit_depth_control(cmd_buffer, states);
3043
3044 if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
3045 radv_emit_stencil_control(cmd_buffer);
3046
3047 if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
3048 radv_emit_fragment_shading_rate(cmd_buffer);
3049
3050 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
3051 radv_emit_primitive_restart_enable(cmd_buffer);
3052
3053 if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
3054 radv_emit_rasterizer_discard_enable(cmd_buffer);
3055
3056 if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
3057 radv_emit_logic_op(cmd_buffer);
3058
3059 if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
3060 radv_emit_color_write_enable(cmd_buffer);
3061
3062 if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
3063 radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);
3064
3065 cmd_buffer->state.dirty &= ~states;
3066 }
3067
3068 static void
3069 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
3070 {
3071 struct radv_descriptor_state *descriptors_state =
3072 radv_get_descriptors_state(cmd_buffer, bind_point);
3073 struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
3074 unsigned bo_offset;
3075
3076 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
3077 &bo_offset))
3078 return;
3079
3080 set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3081 set->header.va += bo_offset;
3082 }
3083
3084 static void
3085 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
3086 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3087 {
3088 struct radv_descriptor_state *descriptors_state =
3089 radv_get_descriptors_state(cmd_buffer, bind_point);
3090 uint32_t size = MAX_SETS * 4;
3091 uint32_t offset;
3092 void *ptr;
3093
3094 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
3095 return;
3096
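/* Each of the MAX_SETS entries is 4 bytes: only the lower 32 bits of each
 * descriptor set VA are stored here. (Presumably the shaders reconstruct the
 * upper half from a known base; that assumption isn't visible in this
 * function.)
 */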
3097 for (unsigned i = 0; i < MAX_SETS; i++) {
3098 uint32_t *uptr = ((uint32_t *)ptr) + i;
3099 uint64_t set_va = 0;
3100 struct radv_descriptor_set *set = descriptors_state->sets[i];
3101 if (descriptors_state->valid & (1u << i))
3102 set_va = set->header.va;
3103 uptr[0] = set_va & 0xffffffff;
3104 }
3105
3106 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3107 va += offset;
3108
3109 if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3110 if (pipeline->shaders[MESA_SHADER_VERTEX])
3111 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
3112 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3113
3114 if (pipeline->shaders[MESA_SHADER_FRAGMENT])
3115 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
3116 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3117
3118 if (radv_pipeline_has_gs(pipeline))
3119 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY,
3120 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3121
3122 if (radv_pipeline_has_tess(pipeline))
3123 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL,
3124 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3125
3126 if (radv_pipeline_has_tess(pipeline))
3127 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL,
3128 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3129 } else {
3130 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE,
3131 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3132 }
3133 }
3134
3135 static void
3136 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3137 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3138 {
3139 struct radv_descriptor_state *descriptors_state =
3140 radv_get_descriptors_state(cmd_buffer, bind_point);
3141 bool flush_indirect_descriptors;
3142
3143 if (!descriptors_state->dirty)
3144 return;
3145
3146 if (descriptors_state->push_dirty)
3147 radv_flush_push_descriptors(cmd_buffer, bind_point);
3148
3149 flush_indirect_descriptors = pipeline && pipeline->need_indirect_descriptor_sets;
3150
3151 if (flush_indirect_descriptors)
3152 radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);
3153
3154 ASSERTED unsigned cdw_max =
3155 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SETS * MESA_SHADER_STAGES * 4);
3156
3157 if (pipeline) {
3158 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
3159 radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state,
3160 MESA_SHADER_COMPUTE);
3161 } else {
3162 radv_foreach_stage(stage, stages)
3163 {
3164 if (!cmd_buffer->state.pipeline->shaders[stage])
3165 continue;
3166
3167 radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, stage);
3168 }
3169 }
3170 }
3171
3172 descriptors_state->dirty = 0;
3173 descriptors_state->push_dirty = false;
3174
3175 assert(cmd_buffer->cs->cdw <= cdw_max);
3176
3177 if (unlikely(cmd_buffer->device->trace_bo))
3178 radv_save_descriptors(cmd_buffer, bind_point);
3179 }
3180
3181 static bool
3182 radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
3183 {
3184 struct radv_userdata_info *loc =
3185 radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
3186 return loc->sgpr_idx != -1;
3187 }
3188
3189 static void
3190 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3191 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3192 {
3193 struct radv_descriptor_state *descriptors_state =
3194 radv_get_descriptors_state(cmd_buffer, bind_point);
3195 struct radv_shader_variant *shader, *prev_shader;
3196 bool need_push_constants = false;
3197 unsigned offset;
3198 void *ptr;
3199 uint64_t va;
3200 uint32_t internal_stages;
3201 uint32_t dirty_stages = 0;
3202
3203 stages &= cmd_buffer->push_constant_stages;
3204 if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
3205 return;
3206
3207 internal_stages = stages;
3208 switch (bind_point) {
3209 case VK_PIPELINE_BIND_POINT_GRAPHICS:
3210 break;
3211 case VK_PIPELINE_BIND_POINT_COMPUTE:
3212 dirty_stages = RADV_RT_STAGE_BITS;
3213 break;
3214 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
3215 internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3216 dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3217 break;
3218 default:
3219 unreachable("Unhandled bind point");
3220 }
3221
3222 radv_foreach_stage(stage, internal_stages)
3223 {
3224 shader = radv_get_shader(pipeline, stage);
3225 if (!shader)
3226 continue;
3227
3228 need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
3229
3230 uint8_t base = shader->info.min_push_constant_used / 4;
3231
3232 radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3233 (uint32_t *)&cmd_buffer->push_constants[base * 4]);
3234 }
3235
3236 if (need_push_constants) {
3237 if (!radv_cmd_buffer_upload_alloc(
3238 cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
3239 &ptr))
3240 return;
3241
3242 memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
3243 memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
3244 16 * pipeline->dynamic_offset_count);
3245
3246 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3247 va += offset;
3248
3249 ASSERTED unsigned cdw_max =
3250 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_SHADER_STAGES * 4);
3251
3252 prev_shader = NULL;
3253 radv_foreach_stage(stage, internal_stages)
3254 {
3255 shader = radv_get_shader(pipeline, stage);
3256
3257 /* Avoid redundantly emitting the address for merged stages. */
3258 if (shader && shader != prev_shader) {
3259 radv_emit_userdata_address(cmd_buffer, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
3260
3261 prev_shader = shader;
3262 }
3263 }
3264 assert(cmd_buffer->cs->cdw <= cdw_max);
3265 }
3266
3267 cmd_buffer->push_constant_stages &= ~stages;
3268 cmd_buffer->push_constant_stages |= dirty_stages;
3269 }
3270
3271 enum radv_dst_sel {
3272 DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3273 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3274 DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3275 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3276 DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3277 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3278 DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3279 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3280 DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3281 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3282 DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3283 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3284 };
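/* DST_SEL_* selects what each destination channel reads: e.g. DST_SEL_XY01
 * routes buffer components X/Y through and returns constant 0 for Z and
 * constant 1 for W, which is how the table below fills in the missing
 * components of narrow data formats.
 */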
3285
3286 static const uint32_t data_format_dst_sel[] = {
3287 [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
3288 [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
3289 [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
3290 [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
3291 [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
3292 [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
3293 [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
3294 [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
3295 [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
3296 [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
3297 [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
3298 [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
3299 [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
3300 [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
3301 [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
3302 };
3303
3304 static void
3305 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3306 {
3307 if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
3308 cmd_buffer->state.pipeline->vb_desc_usage_mask) {
3309 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3310 struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
3311 enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
3312 unsigned vb_offset;
3313 void *vb_ptr;
3314 unsigned desc_index = 0;
3315 uint32_t mask = pipeline->vb_desc_usage_mask;
3316 uint64_t va;
3317 struct radv_vs_input_state *vs_state =
3318 vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
3319
3320 /* allocate some descriptor state for vertex buffers */
3321 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
3322 return;
3323
3324 assert(!vs_state || pipeline->use_per_attribute_vb_descs);
3325
3326 while (mask) {
3327 unsigned i = u_bit_scan(&mask);
3328 uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
3329 uint32_t offset, rsrc_word3;
3330 unsigned binding =
3331 vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
3332 : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
3333 struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
3334 unsigned num_records;
3335 unsigned stride;
3336
3337 if (vs_state) {
3338 unsigned format = vs_state->formats[i];
3339 unsigned dfmt = format & 0xf;
3340 unsigned nfmt = (format >> 4) & 0x7;
3341
3342 rsrc_word3 =
3343 vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
3344
3345 if (chip >= GFX10)
3346 rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
3347 else
3348 rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
3349 } else {
3350 if (chip >= GFX10)
3351 rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
3352 else
3353 rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
3354 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3355 }
3356
3357 if (!buffer) {
3358 if (vs_state) {
3359 /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
3360 * to include the format/word3 so that the alpha channel is 1 for formats without an
3361 * alpha channel.
3362 */
3363 desc[0] = 0;
3364 desc[1] = S_008F04_STRIDE(16);
3365 desc[2] = 0;
3366 desc[3] = rsrc_word3;
3367 } else {
3368 memset(desc, 0, 4 * 4);
3369 }
3370 continue;
3371 }
3372
3373 va = radv_buffer_get_va(buffer->bo);
3374
3375 offset = cmd_buffer->vertex_bindings[binding].offset;
3376 va += offset + buffer->offset;
3377 if (vs_state)
3378 va += vs_state->offsets[i];
3379
3380 if (cmd_buffer->vertex_bindings[binding].size) {
3381 num_records = cmd_buffer->vertex_bindings[binding].size;
3382 } else {
3383 num_records = buffer->size - offset;
3384 }
3385
3386 if (pipeline->graphics.uses_dynamic_stride) {
3387 stride = cmd_buffer->vertex_bindings[binding].stride;
3388 } else {
3389 stride = pipeline->binding_stride[binding];
3390 }
3391
3392 if (pipeline->use_per_attribute_vb_descs) {
3393 uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
3394 : pipeline->attrib_ends[i];
3395
3396 if (num_records < attrib_end) {
3397 num_records = 0; /* not enough space for one vertex */
3398 } else if (stride == 0) {
3399 num_records = 1; /* only one vertex */
3400 } else {
3401 num_records = (num_records - attrib_end) / stride + 1;
3402 /* If attrib_offset>stride, then the compiler will increase the vertex index by
3403 * attrib_offset/stride and decrease the offset to attrib_offset%stride. This is
3404 * only allowed with static strides.
3405 */
3406 num_records += pipeline->attrib_index_offset[i];
3407 }
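/* Worked example (hypothetical values): with num_records = 100 bytes,
 * attrib_end = 12 and stride = 16, the computation above yields
 * (100 - 12) / 16 + 1 = 6 records; vertex 5 reads bytes [80, 92) which
 * still fit, while vertex 6 would read past the end of the binding.
 */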
3408
3409 /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
3410 * into bytes in that case. GFX8 always uses bytes.
3411 */
3412 if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
3413 num_records = (num_records - 1) * stride + attrib_end;
3414 } else if (!num_records) {
3415 /* On GFX9, it seems bounds checking is disabled if both
3416 * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
3417 * GFX10.3 but it doesn't hurt.
3418 */
3419 if (vs_state) {
3420 desc[0] = 0;
3421 desc[1] = S_008F04_STRIDE(16);
3422 desc[2] = 0;
3423 desc[3] = rsrc_word3;
3424 } else {
3425 memset(desc, 0, 16);
3426 }
3427 continue;
3428 }
3429 } else {
3430 if (chip != GFX8 && stride)
3431 num_records = DIV_ROUND_UP(num_records, stride);
3432 }
3433
3434 if (chip >= GFX10) {
3435 /* OOB_SELECT chooses the out-of-bounds check:
3436 * - 1: index >= NUM_RECORDS (Structured)
3437 * - 3: offset >= NUM_RECORDS (Raw)
3438 */
3439 int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
3440 rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
3441 }
3442
3443 desc[0] = va;
3444 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
3445 desc[2] = num_records;
3446 desc[3] = rsrc_word3;
3447 }
3448
3449 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3450 va += vb_offset;
3451
3452 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS,
3453 va);
3454
3455 cmd_buffer->state.vb_va = va;
3456 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
3457
3458 if (unlikely(cmd_buffer->device->trace_bo))
3459 radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
3460 }
3461 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
3462 }
3463
3464 static void
3465 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
3466 {
3467 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3468 struct radv_userdata_info *loc;
3469 uint32_t base_reg;
3470
3471 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
3472 if (!radv_get_shader(pipeline, stage))
3473 continue;
3474
3475 loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_STREAMOUT_BUFFERS);
3476 if (loc->sgpr_idx == -1)
3477 continue;
3478
3479 base_reg = pipeline->user_data_0[stage];
3480
3481 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3482 false);
3483 }
3484
3485 if (radv_pipeline_has_gs_copy_shader(pipeline)) {
3486 loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3487 if (loc->sgpr_idx != -1) {
3488 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3489
3490 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3491 va, false);
3492 }
3493 }
3494 }
3495
3496 static void
3497 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
3498 {
3499 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
3500 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
3501 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
3502 unsigned so_offset;
3503 void *so_ptr;
3504 uint64_t va;
3505
3506 /* Allocate some descriptor state for streamout buffers. */
3507 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
3508 return;
3509
3510 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
3511 struct radv_buffer *buffer = sb[i].buffer;
3512 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
3513
3514 if (!(so->enabled_mask & (1 << i)))
3515 continue;
3516
3517 va = radv_buffer_get_va(buffer->bo) + buffer->offset;
3518
3519 va += sb[i].offset;
3520
3521 /* Set the descriptor.
3522 *
3523 * On GFX8, the format must be non-INVALID, otherwise
3524 * the buffer will be considered not bound and store
3525 * instructions will be no-ops.
3526 */
3527 uint32_t size = 0xffffffff;
3528
3529 /* Compute the correct buffer size for NGG streamout
3530 * because it's used to determine the max emit per
3531 * buffer.
3532 */
3533 if (cmd_buffer->device->physical_device->use_ngg_streamout)
3534 size = buffer->size - sb[i].offset;
3535
3536 uint32_t rsrc_word3 =
3537 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3538 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3539
3540 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
3541 rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
3542 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
3543 } else {
3544 rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3545 }
3546
3547 desc[0] = va;
3548 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3549 desc[2] = size;
3550 desc[3] = rsrc_word3;
3551 }
3552
3553 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3554 va += so_offset;
3555
3556 radv_emit_streamout_buffers(cmd_buffer, va);
3557 }
3558
3559 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
3560 }
3561
3562 static void
3563 radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
3564 {
3565 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3566 struct radv_userdata_info *loc;
3567 uint32_t ngg_gs_state = 0;
3568 uint32_t base_reg;
3569
3570 if (!radv_pipeline_has_gs(pipeline) || !pipeline->graphics.is_ngg)
3571 return;
3572
3573 /* By default NGG GS queries are disabled but they are enabled if the
3574 * command buffer has active GDS queries or if it's a secondary command
3575 * buffer that inherits the number of generated primitives.
3576 */
3577 if (cmd_buffer->state.active_pipeline_gds_queries ||
3578 (cmd_buffer->state.inherited_pipeline_statistics &
3579 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
3580 ngg_gs_state = 1;
3581
3582 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, AC_UD_NGG_GS_STATE);
3583 base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
3584 assert(loc->sgpr_idx != -1);
3585
3586 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_gs_state);
3587 }
3588
3589 static void
3590 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3591 {
3592 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
3593 radv_flush_streamout_descriptors(cmd_buffer);
3594 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
3595 VK_PIPELINE_BIND_POINT_GRAPHICS);
3596 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
3597 VK_PIPELINE_BIND_POINT_GRAPHICS);
3598 radv_flush_ngg_gs_state(cmd_buffer);
3599 }
3600
3601 struct radv_draw_info {
3602 /**
3603 * Number of vertices.
3604 */
3605 uint32_t count;
3606
3607 /**
3608 * First instance id.
3609 */
3610 uint32_t first_instance;
3611
3612 /**
3613 * Number of instances.
3614 */
3615 uint32_t instance_count;
3616
3617 /**
3618 * Whether it's an indexed draw.
3619 */
3620 bool indexed;
3621
3622 /**
3623 * Indirect draw parameters resource.
3624 */
3625 struct radv_buffer *indirect;
3626 uint64_t indirect_offset;
3627 uint32_t stride;
3628
3629 /**
3630 * Draw count parameters resource.
3631 */
3632 struct radv_buffer *count_buffer;
3633 uint64_t count_buffer_offset;
3634
3635 /**
3636 * Stream output parameters resource.
3637 */
3638 struct radv_buffer *strmout_buffer;
3639 uint64_t strmout_buffer_offset;
3640 };
3641
3642 static uint32_t
3643 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
3644 {
3645 switch (cmd_buffer->state.index_type) {
3646 case V_028A7C_VGT_INDEX_8:
3647 return 0xffu;
3648 case V_028A7C_VGT_INDEX_16:
3649 return 0xffffu;
3650 case V_028A7C_VGT_INDEX_32:
3651 return 0xffffffffu;
3652 default:
3653 unreachable("invalid index type");
3654 }
3655 }
3656
3657 static void
3658 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
3659 bool indirect_draw, bool count_from_stream_output,
3660 uint32_t draw_vertex_count)
3661 {
3662 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3663 struct radv_cmd_state *state = &cmd_buffer->state;
3664 unsigned topology = state->dynamic.primitive_topology;
3665 bool prim_restart_enable = state->dynamic.primitive_restart_enable;
3666 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3667 unsigned ia_multi_vgt_param;
3668
3669 ia_multi_vgt_param =
3670 si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
3671 draw_vertex_count, topology, prim_restart_enable);
3672
3673 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
3674 if (info->chip_class == GFX9) {
3675 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
3676 R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
3677 } else if (info->chip_class >= GFX7) {
3678 radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
3679 } else {
3680 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
3681 }
3682 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
3683 }
3684 }
3685
3686 static void
3687 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
3688 {
3689 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3690 struct radv_cmd_state *state = &cmd_buffer->state;
3691 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3692
3693 /* Draw state. */
3694 if (info->chip_class < GFX10) {
3695 si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
3696 !!draw_info->strmout_buffer,
3697 draw_info->indirect ? 0 : draw_info->count);
3698 }
3699
3700 if (state->dynamic.primitive_restart_enable) {
3701 uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
3702
3703 if (primitive_reset_index != state->last_primitive_reset_index) {
3704 radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
3705 state->last_primitive_reset_index = primitive_reset_index;
3706 }
3707 }
3708
3709 if (draw_info->strmout_buffer) {
3710 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
3711
3712 va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
3713
3714 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
3715
3716 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
3717 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
3718 COPY_DATA_WR_CONFIRM);
3719 radeon_emit(cs, va);
3720 radeon_emit(cs, va >> 32);
3721 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
3722 radeon_emit(cs, 0); /* unused */
3723
3724 radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
3725 }
3726 }
3727
3728 static void
3729 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_stage_mask)
3730 {
3731 if (src_stage_mask &
3732 (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT |
3733 VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
3734 VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
3735 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
3736 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
3737 }
3738
3739 if (src_stage_mask &
3740 (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
3741 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
3742 VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
3743 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
3744 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
3745 } else if (src_stage_mask &
3746 (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
3747 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
3748 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
3749 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
3750 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
3751 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
3752 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
3753 }
3754 }
3755
3756 static bool
3757 can_skip_buffer_l2_flushes(struct radv_device *device)
3758 {
3759 return device->physical_device->rad_info.chip_class == GFX9 ||
3760 (device->physical_device->rad_info.chip_class >= GFX10 &&
3761 !device->physical_device->rad_info.tcc_rb_non_coherent);
3762 }
3763
3764 /*
3765 * In Vulkan, barriers have two kinds of operations:
3766 *
3767 * - availability (implemented with radv_src_access_flush)
3768 * - visibility (implemented with radv_dst_access_flush)
3769 *
3770 * For a memory operation to observe the result of a previous memory operation,
3771 * one needs to do an availability operation on the source memory and then a
3772 * visibility operation on the target memory.
3773 *
3774 * The complication is that the availability and visibility operations do not
3775 * need to be in the same barrier.
3776 *
3777 * The cleanest way to implement this is to define the availability operation
3778 * as bringing the caches to a "state of rest", in which none of the caches
3779 * below that level are dirty.
3780 *
3781 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
3782 *
3783 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
3784 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
3785 * images. However, given the existence of memory barriers which do not specify
3786 * the image/buffer, it often devolves to just VRAM/GTT anyway.
3787 *
3788 * To help reduce invalidations on GPUs that have L2 coherency between the
3789 * RB and the shader caches, we always invalidate L2 on the src side, as we can
3790 * use our knowledge of past usage to optimize flushes away.
3791 */
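/* For example, assuming a non-coherent image with CB metadata, a barrier with
 * srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT and dstAccessMask =
 * VK_ACCESS_SHADER_READ_BIT maps (via the switches below) to roughly:
 *
 *   src: RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META
 *   dst: RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2_METADATA, plus
 *        RADV_CMD_FLAG_INV_L2 when the image isn't considered L2-coherent
 *        for this command buffer.
 */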
3792
3793 enum radv_cmd_flush_bits
3794 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags,
3795 const struct radv_image *image)
3796 {
3797 bool has_CB_meta = true, has_DB_meta = true;
3798 bool image_is_coherent = image ? image->l2_coherent : false;
3799 enum radv_cmd_flush_bits flush_bits = 0;
3800
3801 if (image) {
3802 if (!radv_image_has_CB_metadata(image))
3803 has_CB_meta = false;
3804 if (!radv_image_has_htile(image))
3805 has_DB_meta = false;
3806 }
3807
3808 u_foreach_bit(b, src_flags)
3809 {
3810 switch ((VkAccessFlagBits)(1 << b)) {
3811 case VK_ACCESS_SHADER_WRITE_BIT:
3812 /* Since the STORAGE bit isn't set, we know that this is a meta operation.
3813 * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
3814 * set it here. */
3815 if (image && !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3816 if (vk_format_is_depth_or_stencil(image->vk_format)) {
3817 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3818 } else {
3819 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3820 }
3821 }
3822
3823 /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
3824 * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
3825 * invalidating. */
3826 if (!image_is_coherent)
3827 flush_bits |= RADV_CMD_FLAG_WB_L2;
3828 break;
3829 case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3830 case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3831 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3832 if (!image_is_coherent)
3833 flush_bits |= RADV_CMD_FLAG_WB_L2;
3834 break;
3835 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3836 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3837 if (has_CB_meta)
3838 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3839 break;
3840 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3841 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3842 if (has_DB_meta)
3843 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3844 break;
3845 case VK_ACCESS_TRANSFER_WRITE_BIT:
3846 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3847
3848 if (!image_is_coherent)
3849 flush_bits |= RADV_CMD_FLAG_INV_L2;
3850 if (has_CB_meta)
3851 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3852 if (has_DB_meta)
3853 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3854 break;
3855 case VK_ACCESS_MEMORY_WRITE_BIT:
3856 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3857
3858 if (!image_is_coherent)
3859 flush_bits |= RADV_CMD_FLAG_INV_L2;
3860 if (has_CB_meta)
3861 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3862 if (has_DB_meta)
3863 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3864 break;
3865 default:
3866 break;
3867 }
3868 }
3869 return flush_bits;
3870 }
3871
3872 enum radv_cmd_flush_bits
3873 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags,
3874 const struct radv_image *image)
3875 {
3876 bool has_CB_meta = true, has_DB_meta = true;
3877 enum radv_cmd_flush_bits flush_bits = 0;
3878 bool flush_CB = true, flush_DB = true;
3879 bool image_is_coherent = image ? image->l2_coherent : false;
3880
3881 if (image) {
3882 if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3883 flush_CB = false;
3884 flush_DB = false;
3885 }
3886
3887 if (!radv_image_has_CB_metadata(image))
3888 has_CB_meta = false;
3889 if (!radv_image_has_htile(image))
3890 has_DB_meta = false;
3891 }
3892
3893 /* The L2 invalidations below do not apply to the CB/DB caches. So if no non-coherent images
3894 * are dirty in L2 from CB/DB usage, the data is already usable by all the other L2 clients. */
3895 image_is_coherent |=
3896 can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
3897
3898 u_foreach_bit(b, dst_flags)
3899 {
3900 switch ((VkAccessFlagBits)(1 << b)) {
3901 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
3902 case VK_ACCESS_INDEX_READ_BIT:
3903 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3904 break;
3905 case VK_ACCESS_UNIFORM_READ_BIT:
3906 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3907 break;
3908 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
3909 case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
3910 case VK_ACCESS_TRANSFER_READ_BIT:
3911 case VK_ACCESS_TRANSFER_WRITE_BIT:
3912 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3913
3914 if (has_CB_meta || has_DB_meta)
3915 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
3916 if (!image_is_coherent)
3917 flush_bits |= RADV_CMD_FLAG_INV_L2;
3918 break;
3919 case VK_ACCESS_SHADER_READ_BIT:
3920 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3921 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
3922 * invalidate the scalar cache. */
3923 if (!cmd_buffer->device->physical_device->use_llvm && !image)
3924 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
3925
3926 if (has_CB_meta || has_DB_meta)
3927 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
3928 if (!image_is_coherent)
3929 flush_bits |= RADV_CMD_FLAG_INV_L2;
3930 break;
3931 case VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR:
3932 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3933 if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
3934 flush_bits |= RADV_CMD_FLAG_INV_L2;
3935 break;
3936 case VK_ACCESS_SHADER_WRITE_BIT:
3937 case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3938 break;
3939 case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
3940 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3941 if (flush_CB)
3942 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3943 if (has_CB_meta)
3944 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3945 break;
3946 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
3947 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3948 if (flush_DB)
3949 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3950 if (has_DB_meta)
3951 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3952 break;
3953 case VK_ACCESS_MEMORY_READ_BIT:
3954 case VK_ACCESS_MEMORY_WRITE_BIT:
3955 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3956 if (!image_is_coherent)
3957 flush_bits |= RADV_CMD_FLAG_INV_L2;
3958 if (flush_CB)
3959 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3960 if (has_CB_meta)
3961 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3962 if (flush_DB)
3963 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3964 if (has_DB_meta)
3965 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3966 break;
3967 default:
3968 break;
3969 }
3970 }
3971 return flush_bits;
3972 }
3973
3974 void
3975 radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
3976 {
3977 struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
3978 if (fb && !fb->imageless) {
3979 for (int i = 0; i < fb->attachment_count; ++i) {
3980 cmd_buffer->state.flush_bits |=
3981 radv_src_access_flush(cmd_buffer, barrier->src_access_mask, fb->attachments[i]->image);
3982 }
3983 } else {
3984 cmd_buffer->state.flush_bits |=
3985 radv_src_access_flush(cmd_buffer, barrier->src_access_mask, NULL);
3986 }
3987
3988 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
3989
3990 if (fb && !fb->imageless) {
3991 for (int i = 0; i < fb->attachment_count; ++i) {
3992 cmd_buffer->state.flush_bits |=
3993 radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, fb->attachments[i]->image);
3994 }
3995 } else {
3996 cmd_buffer->state.flush_bits |=
3997 radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, NULL);
3998 }
3999 }
4000
4001 uint32_t
4002 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
4003 {
4004 struct radv_cmd_state *state = &cmd_buffer->state;
4005 uint32_t subpass_id = state->subpass - state->pass->subpasses;
4006
4007 /* The id of this subpass shouldn't exceed the number of subpasses in
4008 * this render pass minus 1.
4009 */
4010 assert(subpass_id < state->pass->subpass_count);
4011 return subpass_id;
4012 }
4013
4014 static struct radv_sample_locations_state *
4015 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
4016 bool begin_subpass)
4017 {
4018 struct radv_cmd_state *state = &cmd_buffer->state;
4019 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4020 struct radv_image_view *view = state->attachments[att_idx].iview;
4021
4022 if (view->image->info.samples == 1)
4023 return NULL;
4024
4025 if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
4026 /* Return the initial sample locations if this is the initial
4027 * layout transition of the given subpass attachment.
4028 */
4029 if (state->attachments[att_idx].sample_location.count > 0)
4030 return &state->attachments[att_idx].sample_location;
4031 } else {
4032 /* Otherwise return the subpass sample locations if defined. */
4033 if (state->subpass_sample_locs) {
4034 /* Because the driver sets the current subpass before
4035 * initial layout transitions, we should use the sample
4036 * locations from the previous subpass to avoid an
4037 * off-by-one problem. Otherwise, use the sample
4038 * locations for the current subpass for final layout
4039 * transitions.
4040 */
4041 if (begin_subpass)
4042 subpass_id--;
4043
4044 for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
4045 if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
4046 return &state->subpass_sample_locs[i].sample_location;
4047 }
4048 }
4049 }
4050
4051 return NULL;
4052 }
4053
4054 static void
4055 radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
4056 struct radv_subpass_attachment att, bool begin_subpass)
4057 {
4058 unsigned idx = att.attachment;
4059 struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
4060 struct radv_sample_locations_state *sample_locs;
4061 VkImageSubresourceRange range;
4062 range.aspectMask = view->aspect_mask;
4063 range.baseMipLevel = view->base_mip;
4064 range.levelCount = 1;
4065 range.baseArrayLayer = view->base_layer;
4066 range.layerCount = cmd_buffer->state.framebuffer->layers;
4067
4068 if (cmd_buffer->state.subpass->view_mask) {
4069 /* If the current subpass uses multiview, the driver might have
4070 * performed a fast color/depth clear to the whole image
4071 * (including all layers). To make sure the driver will
4072 * decompress the image correctly (if needed), we have to
4073 * account for the "real" number of layers. If the view mask is
4074 * sparse, this will decompress more layers than needed.
4075 */
4076 range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
4077 }
4078
4079 /* Get the subpass sample locations for the given attachment, if NULL
4080 * is returned the driver will use the default HW locations.
4081 */
4082 sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
4083
4084 /* Determine if the subpass uses separate depth/stencil layouts. */
4085 bool uses_separate_depth_stencil_layouts = false;
4086 if ((cmd_buffer->state.attachments[idx].current_layout !=
4087 cmd_buffer->state.attachments[idx].current_stencil_layout) ||
4088 (att.layout != att.stencil_layout)) {
4089 uses_separate_depth_stencil_layouts = true;
4090 }
4091
4092 /* For separate layouts, perform depth and stencil transitions
4093 * separately.
4094 */
4095 if (uses_separate_depth_stencil_layouts &&
4096 (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
4097 /* Depth-only transitions. */
4098 range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4099 radv_handle_image_transition(cmd_buffer, view->image,
4100 cmd_buffer->state.attachments[idx].current_layout,
4101 cmd_buffer->state.attachments[idx].current_in_render_loop,
4102 att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4103
4104 /* Stencil-only transitions. */
4105 range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
4106 radv_handle_image_transition(
4107 cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
4108 cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
4109 att.in_render_loop, 0, 0, &range, sample_locs);
4110 } else {
4111 radv_handle_image_transition(cmd_buffer, view->image,
4112 cmd_buffer->state.attachments[idx].current_layout,
4113 cmd_buffer->state.attachments[idx].current_in_render_loop,
4114 att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4115 }
4116
4117 cmd_buffer->state.attachments[idx].current_layout = att.layout;
4118 cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
4119 cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
4120 }
4121
4122 void
4123 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
4124 {
4125 cmd_buffer->state.subpass = subpass;
4126
4127 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
4128 }
4129
4130 static VkResult
4131 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
4132 struct radv_render_pass *pass,
4133 const VkRenderPassBeginInfo *info)
4134 {
4135 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
4136 vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
4137 struct radv_cmd_state *state = &cmd_buffer->state;
4138
4139 if (!sample_locs) {
4140 state->subpass_sample_locs = NULL;
4141 return VK_SUCCESS;
4142 }
4143
4144 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
4145 const VkAttachmentSampleLocationsEXT *att_sample_locs =
4146 &sample_locs->pAttachmentInitialSampleLocations[i];
4147 uint32_t att_idx = att_sample_locs->attachmentIndex;
4148 struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
4149
4150 assert(vk_format_is_depth_or_stencil(image->vk_format));
4151
4152 /* From the Vulkan spec 1.1.108:
4153 *
4154 * "If the image referenced by the framebuffer attachment at
4155 * index attachmentIndex was not created with
4156 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
4157 * then the values specified in sampleLocationsInfo are
4158 * ignored."
4159 */
4160 if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
4161 continue;
4162
4163 const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
4164
4165 state->attachments[att_idx].sample_location.per_pixel =
4166 sample_locs_info->sampleLocationsPerPixel;
4167 state->attachments[att_idx].sample_location.grid_size =
4168 sample_locs_info->sampleLocationGridSize;
4169 state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
4170 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
4171 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4172 }
4173
4174 state->subpass_sample_locs =
4175 vk_alloc(&cmd_buffer->pool->alloc,
4176 sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
4177 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4178 if (state->subpass_sample_locs == NULL) {
4179 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4180 return cmd_buffer->record_result;
4181 }
4182
4183 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
4184
4185 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
4186 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
4187 &sample_locs->pPostSubpassSampleLocations[i];
4188 const VkSampleLocationsInfoEXT *sample_locs_info =
4189 &subpass_sample_locs_info->sampleLocationsInfo;
4190
4191 state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
4192 state->subpass_sample_locs[i].sample_location.per_pixel =
4193 sample_locs_info->sampleLocationsPerPixel;
4194 state->subpass_sample_locs[i].sample_location.grid_size =
4195 sample_locs_info->sampleLocationGridSize;
4196 state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
4197 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
4198 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4199 }
4200
4201 return VK_SUCCESS;
4202 }
4203
4204 static VkResult
4205 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
4206 const VkRenderPassBeginInfo *info,
4207 const struct radv_extra_render_pass_begin_info *extra)
4208 {
4209 struct radv_cmd_state *state = &cmd_buffer->state;
4210 const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
4211
4212 if (info) {
4213 attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4214 }
4215
4216 if (pass->attachment_count == 0) {
4217 state->attachments = NULL;
4218 return VK_SUCCESS;
4219 }
4220
4221 state->attachments =
4222 vk_alloc(&cmd_buffer->pool->alloc, pass->attachment_count * sizeof(state->attachments[0]), 8,
4223 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4224 if (state->attachments == NULL) {
4225 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4226 return cmd_buffer->record_result;
4227 }
4228
4229 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
4230 struct radv_render_pass_attachment *att = &pass->attachments[i];
4231 VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
4232 VkImageAspectFlags clear_aspects = 0;
4233
4234 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4235 /* color attachment */
4236 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4237 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
4238 }
4239 } else {
4240 /* depthstencil attachment */
4241 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
4242 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4243 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4244 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4245 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
4246 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4247 }
4248 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4249 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4250 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4251 }
4252 }
4253
4254 state->attachments[i].pending_clear_aspects = clear_aspects;
4255 state->attachments[i].cleared_views = 0;
4256 if (clear_aspects && info) {
4257 assert(info->clearValueCount > i);
4258 state->attachments[i].clear_value = info->pClearValues[i];
4259 }
4260
4261 state->attachments[i].current_layout = att->initial_layout;
4262 state->attachments[i].current_in_render_loop = false;
4263 state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
4264 state->attachments[i].disable_dcc = extra && extra->disable_dcc;
4265 state->attachments[i].sample_location.count = 0;
4266
4267 struct radv_image_view *iview;
4268 if (attachment_info && attachment_info->attachmentCount > i) {
4269 iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
4270 } else {
4271 iview = state->framebuffer->attachments[i];
4272 }
4273
4274 state->attachments[i].iview = iview;
4275 if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4276 radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
4277 } else {
4278 radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
4279 }
4280 }
4281
4282 return VK_SUCCESS;
4283 }
4284
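/* Allocation first tries to recycle a command buffer from the pool's
 * free list (resetting and re-initializing the vk object) before
 * falling back to creating a fresh one.
 */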
VkResult
radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
                            VkCommandBuffer *pCommandBuffers)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

      if (!list_is_empty(&pool->free_cmd_buffers)) {
         struct radv_cmd_buffer *cmd_buffer =
            list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = radv_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;
         vk_command_buffer_finish(&cmd_buffer->vk);
         VkResult init_result =
            vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
         if (init_result != VK_SUCCESS)
            result = init_result;

         pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
      } else {
         result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
      }
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       * "vkAllocateCommandBuffers can be used to create multiple
       *  command buffers. If the creation of any of those command
       *  buffers fails, the implementation must destroy all
       *  successfully created command buffer objects from this
       *  command, set all entries of the pCommandBuffers array to
       *  NULL and return the error."
       */
      memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}

void
radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (cmd_buffer) {
         if (cmd_buffer->pool) {
            list_del(&cmd_buffer->pool_link);
            list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
         } else
            radv_destroy_cmd_buffer(cmd_buffer);
      }
   }
}

VkResult
radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   return radv_reset_cmd_buffer(cmd_buffer);
}

VkResult
radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VkResult result = VK_SUCCESS;

   if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, no need to do it again.
       */
      result = radv_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   cmd_buffer->state.last_primitive_reset_en = -1;
   cmd_buffer->state.last_index_type = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_vertex_offset = -1;
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_drawid = -1;
   cmd_buffer->state.predication_type = -1;
   cmd_buffer->state.last_sx_ps_downconvert = -1;
   cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
   cmd_buffer->state.last_sx_blend_opt_control = -1;
   cmd_buffer->state.last_nggc_settings = -1;
   cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
      assert(pBeginInfo->pInheritanceInfo);
      cmd_buffer->state.framebuffer =
         radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
      cmd_buffer->state.pass =
         radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);

      struct radv_subpass *subpass =
         &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];

      if (cmd_buffer->state.framebuffer) {
         result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL, NULL);
         if (result != VK_SUCCESS)
            return result;
      }

      cmd_buffer->state.inherited_pipeline_statistics =
         pBeginInfo->pInheritanceInfo->pipelineStatistics;

      radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);

   radv_describe_begin_cmd_buffer(cmd_buffer);

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;

   return result;
}

void
radv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, uint32_t firstBinding,
                          uint32_t bindingCount, const VkBuffer *pBuffers,
                          const VkDeviceSize *pOffsets)
{
   radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount, pBuffers, pOffsets,
                                 NULL, NULL);
}

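/* Bind vertex buffers and, on GFX6 and GFX10+, keep the per-binding
 * misalignment masks up to date: those generations need vertex fetches
 * to be handled differently when the buffer offset/stride is not
 * aligned to the attribute format's requirement, so alignment is
 * tracked eagerly here instead of being recomputed at draw time.
 */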
void
radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
                              uint32_t bindingCount, const VkBuffer *pBuffers,
                              const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
                              const VkDeviceSize *pStrides)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   bool changed = false;

   /* We have to defer setting up the vertex buffer descriptors since we
    * need the buffer strides from the pipeline. */

   assert(firstBinding + bindingCount <= MAX_VBS);
   cmd_buffer->state.vbo_misaligned_mask = state->misaligned_mask;
   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   for (uint32_t i = 0; i < bindingCount; i++) {
      RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
      uint32_t idx = firstBinding + i;
      VkDeviceSize size = pSizes ? pSizes[i] : 0;
      VkDeviceSize stride = pStrides ? pStrides[i] : 0;

      /* pSizes and pStrides are optional. */
      if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] ||
                       vb[idx].size != size || (pStrides && vb[idx].stride != stride))) {
         changed = true;
      }

      vb[idx].buffer = buffer;
      vb[idx].offset = pOffsets[i];
      vb[idx].size = size;
      /* If pStrides is NULL, we must not overwrite the strides specified by
       * CmdSetVertexInputEXT. */

      if (chip == GFX6 || chip >= GFX10) {
         const uint32_t bit = 1u << idx;
         if (!buffer) {
            cmd_buffer->state.vbo_misaligned_mask &= ~bit;
            cmd_buffer->state.vbo_bound_mask &= ~bit;
         } else {
            cmd_buffer->state.vbo_bound_mask |= bit;
            if (pStrides && vb[idx].stride != stride) {
               if (stride & state->format_align_req_minus_1[idx])
                  cmd_buffer->state.vbo_misaligned_mask |= bit;
               else
                  cmd_buffer->state.vbo_misaligned_mask &= ~bit;
            }
            if (state->possibly_misaligned_mask & bit &&
                (vb[idx].offset + state->offsets[idx]) & state->format_align_req_minus_1[idx])
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         }
      }

      if (pStrides)
         vb[idx].stride = stride;

      if (buffer) {
         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo);
      }
   }

   if (!changed) {
      /* No state changes. */
      return;
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}

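/* Mapping between Vulkan index types, the VGT_INDEX_TYPE hardware
 * encoding and the index size in bytes:
 *
 *    VK_INDEX_TYPE_UINT8_EXT -> V_028A7C_VGT_INDEX_8  (1 byte)
 *    VK_INDEX_TYPE_UINT16    -> V_028A7C_VGT_INDEX_16 (2 bytes)
 *    VK_INDEX_TYPE_UINT32    -> V_028A7C_VGT_INDEX_32 (4 bytes)
 */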
static uint32_t
vk_to_index_type(VkIndexType type)
{
   switch (type) {
   case VK_INDEX_TYPE_UINT8_EXT:
      return V_028A7C_VGT_INDEX_8;
   case VK_INDEX_TYPE_UINT16:
      return V_028A7C_VGT_INDEX_16;
   case VK_INDEX_TYPE_UINT32:
      return V_028A7C_VGT_INDEX_32;
   default:
      unreachable("invalid index type");
   }
}

static uint32_t
radv_get_vgt_index_size(uint32_t type)
{
   switch (type) {
   case V_028A7C_VGT_INDEX_8:
      return 1;
   case V_028A7C_VGT_INDEX_16:
      return 2;
   case V_028A7C_VGT_INDEX_32:
      return 4;
   default:
      unreachable("invalid index type");
   }
}

void
radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
                        VkIndexType indexType)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);

   if (cmd_buffer->state.index_buffer == index_buffer && cmd_buffer->state.index_offset == offset &&
       cmd_buffer->state.index_type == indexType) {
      /* No state changes. */
      return;
   }

   cmd_buffer->state.index_buffer = index_buffer;
   cmd_buffer->state.index_offset = offset;
   cmd_buffer->state.index_type = vk_to_index_type(indexType);
   cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
   cmd_buffer->state.index_va += index_buffer->offset + offset;

   int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
   cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
}

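/* Record a descriptor set binding and keep all BOs backing the set
 * resident: each referenced buffer individually when the global BO list
 * is disabled, plus the set's own backing BO.
 */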
static void
radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                         struct radv_descriptor_set *set, unsigned idx)
{
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);

   assert(set);

   if (!cmd_buffer->device->use_global_bo_list) {
      for (unsigned j = 0; j < set->header.buffer_count; ++j)
         if (set->descriptors[j])
            radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
   }

   if (set->header.bo)
      radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
}

void
radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                           VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
                           const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
                           const uint32_t *pDynamicOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   const bool no_dynamic_bounds =
      cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned set_idx = i + firstSet;
      RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);

      /* If the set is already bound we only need to update the
       * (potentially changed) dynamic offsets. */
      if (descriptors_state->sets[set_idx] != set ||
          !(descriptors_state->valid & (1u << set_idx))) {
         radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
      }

      for (unsigned j = 0; j < layout->set[set_idx].dynamic_offset_count; ++j, ++dyn_idx) {
         unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
         uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
         assert(dyn_idx < dynamicOffsetCount);

         struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;

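         /* Each dynamic buffer descriptor is a 4-dword V# buffer
          * resource: base address lo/hi, the range size in dword2
          * (forced to ~0u when RADV_DEBUG_NO_DYNAMIC_BOUNDS is set) and
          * the swizzle/format word in dword3, whose encoding differs
          * between GFX10+ and older generations.
          */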
         if (!range->va) {
            memset(dst, 0, 4 * 4);
         } else {
            uint64_t va = range->va + pDynamicOffsets[dyn_idx];
            dst[0] = va;
            dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
            dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
            dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

            if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
            } else {
               dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
            }
         }

         cmd_buffer->push_constant_stages |= layout->set[set_idx].dynamic_offset_stages;
      }
   }
}

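/* Lazily (re)size the host-side storage backing the push descriptor
 * set: grow to at least 1 KiB, at least double the previous capacity,
 * and cap at the worst-case size of a fully populated push descriptor
 * set (96 bytes per descriptor).
 */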
static bool
radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
                              struct radv_descriptor_set_layout *layout,
                              VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   set->header.size = layout->size;
   set->header.layout = layout;

   if (descriptors_state->push_set.capacity < set->header.size) {
      size_t new_size = MAX2(set->header.size, 1024);
      new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
      new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);

      free(set->header.mapped_ptr);
      set->header.mapped_ptr = malloc(new_size);

      if (!set->header.mapped_ptr) {
         descriptors_state->push_set.capacity = 0;
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         return false;
      }

      descriptors_state->push_set.capacity = new_size;
   }

   return true;
}

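/* Internal (meta) variant of push descriptors: the set contents are
 * written into the command buffer's upload BO, so no separate
 * allocation or lifetime tracking is needed for meta operations.
 */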
void
radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
                              VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
                              uint32_t set, uint32_t descriptorWriteCount,
                              const VkWriteDescriptorSet *pDescriptorWrites)
{
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
   unsigned bo_offset;

   assert(set == 0);
   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   push_set->header.size = layout->set[set].layout->size;
   push_set->header.layout = layout->set[set].layout;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
                                     (void **)&push_set->header.mapped_ptr))
      return;

   push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   push_set->header.va += bo_offset;

   radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
                               radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
                               pDescriptorWrites, 0, NULL);

   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
}

void
radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                             VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
                             const VkWriteDescriptorSet *pDescriptorWrites)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&descriptors_state->push_set.set;

   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
                                      pipelineBindPoint))
      return;

   /* Check that there are no inline uniform block updates when calling
    * vkCmdPushDescriptorSetKHR() because this is invalid according to the
    * Vulkan spec.
    */
   for (int i = 0; i < descriptorWriteCount; i++) {
      ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
      assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
   }

   radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
                               radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
                               pDescriptorWrites, 0, NULL);

   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
   descriptors_state->push_dirty = true;
}

void
radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
                                         VkDescriptorUpdateTemplate descriptorUpdateTemplate,
                                         VkPipelineLayout _layout, uint32_t set, const void *pData)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, templ->bind_point);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&descriptors_state->push_set.set;

   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
                                      templ->bind_point))
      return;

   radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
                                            descriptorUpdateTemplate, pData);

   radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
   descriptors_state->push_dirty = true;
}

void
radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
                      VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
                      const void *pValues)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   memcpy(cmd_buffer->push_constants + offset, pValues, size);
   cmd_buffer->push_constant_stages |= stageFlags;
}

VkResult
radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_emit_mip_change_flush_default(cmd_buffer);

   if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
         cmd_buffer->state.flush_bits |=
            RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;

      /* Make sure to sync all pending active queries at the end of the
       * command buffer.
       */
      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

      /* Flush noncoherent images on GFX9+ so we can assume they're clean at
       * the start of a command buffer.
       */
      if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
         cmd_buffer->state.flush_bits |= radv_src_access_flush(
            cmd_buffer,
            VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
            NULL);

      /* Since NGG streamout uses GDS, we need to make GDS idle when
       * we leave the IB, otherwise another process might overwrite
       * it while our shaders are busy.
       */
      if (cmd_buffer->gds_needed)
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;

      si_emit_cache_flush(cmd_buffer);
   }

   /* Make sure CP DMA is idle at the end of IBs because the kernel
    * doesn't wait for it.
    */
   si_cp_dma_wait_for_idle(cmd_buffer);

   radv_describe_end_cmd_buffer(cmd_buffer);

   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);

   VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
   if (result != VK_SUCCESS)
      return vk_error(cmd_buffer, result);

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;

   return cmd_buffer->record_result;
}

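/* Replay the pre-built register state for a compute pipeline. Compute
 * pipelines have no context registers (asserted below), so the recorded
 * CS can be copied straight into the command stream; only the scratch
 * requirements need to be accumulated on the command buffer.
 */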
static void
radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
      return;

   assert(!pipeline->ctx_cs.cdw);

   cmd_buffer->state.emitted_compute_pipeline = pipeline;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

   cmd_buffer->compute_scratch_size_per_wave_needed =
      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
   cmd_buffer->compute_scratch_waves_wanted =
      MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                      pipeline->shaders[MESA_SHADER_COMPUTE]->bo);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_pipeline(cmd_buffer, pipeline);
}

static void
radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->dirty |= descriptors_state->valid;
}

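/* Bind a pipeline on the given bind point. For graphics pipelines we
 * additionally invalidate the tracked base vertex/instance/draw-id
 * state when the new vertex shader uses a different user-SGPR layout,
 * and request a VGT_FLUSH when switching from an NGG pipeline to a
 * legacy GS pipeline on the GFX10 parts affected by the hardware bug.
 */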
void
radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                     VkPipeline _pipeline)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

   switch (pipelineBindPoint) {
   case VK_PIPELINE_BIND_POINT_COMPUTE:
      if (cmd_buffer->state.compute_pipeline == pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      cmd_buffer->state.compute_pipeline = pipeline;
      cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
      break;
   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
      if (cmd_buffer->state.rt_pipeline == pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      cmd_buffer->state.rt_pipeline = pipeline;
      cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
      radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
      break;
   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      if (cmd_buffer->state.pipeline == pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      bool vtx_emit_count_changed =
         !pipeline || !cmd_buffer->state.pipeline ||
         cmd_buffer->state.pipeline->graphics.vtx_emit_num != pipeline->graphics.vtx_emit_num ||
         cmd_buffer->state.pipeline->graphics.vtx_base_sgpr != pipeline->graphics.vtx_base_sgpr;
      cmd_buffer->state.pipeline = pipeline;
      if (!pipeline)
         break;

      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
      cmd_buffer->push_constant_stages |= pipeline->active_stages;

      /* The new vertex shader might not use the same user SGPRs. */
      if (vtx_emit_count_changed) {
         cmd_buffer->state.last_first_instance = -1;
         cmd_buffer->state.last_vertex_offset = -1;
         cmd_buffer->state.last_drawid = -1;
      }

      /* Prefetch all pipeline shaders at first draw time. */
      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;

      if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
          cmd_buffer->state.emitted_pipeline &&
          cmd_buffer->state.emitted_pipeline->graphics.is_ngg &&
          !cmd_buffer->state.pipeline->graphics.is_ngg) {
         /* Transitioning from NGG to legacy GS requires
          * VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH
          * is also emitted at the beginning of IBs when legacy
          * GS ring pointers are set.
          */
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
      }

      radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
      radv_bind_streamout_state(cmd_buffer, pipeline);

      if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
         cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
      if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
         cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;

      if (radv_pipeline_has_tess(pipeline))
         cmd_buffer->tess_rings_needed = true;
      break;
   default:
      assert(!"invalid bind point");
      break;
   }
}

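/* Dynamic viewport state. Besides storing the viewports themselves we
 * precompute each viewport's transform (scale/translate pair) here so
 * the draw-time emission paths don't have to rederive it.
 */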
void
radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
                    const VkViewport *pViewports)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstViewport + viewportCount;

   assert(firstViewport < MAX_VIEWPORTS);
   assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);

   if (total_count <= state->dynamic.viewport.count &&
       !memcmp(state->dynamic.viewport.viewports + firstViewport, pViewports,
               viewportCount * sizeof(*pViewports))) {
      return;
   }

   if (state->dynamic.viewport.count < total_count)
      state->dynamic.viewport.count = total_count;

   memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
          viewportCount * sizeof(*pViewports));
   for (unsigned i = 0; i < viewportCount; i++) {
      radv_get_viewport_xform(&pViewports[i],
                              state->dynamic.viewport.xform[i + firstViewport].scale,
                              state->dynamic.viewport.xform[i + firstViewport].translate);
   }

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
}

void
radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
                   const VkRect2D *pScissors)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstScissor + scissorCount;

   assert(firstScissor < MAX_SCISSORS);
   assert(total_count >= 1 && total_count <= MAX_SCISSORS);

   if (total_count <= state->dynamic.scissor.count &&
       !memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
               scissorCount * sizeof(*pScissors))) {
      return;
   }

   if (state->dynamic.scissor.count < total_count)
      state->dynamic.scissor.count = total_count;

   memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
          scissorCount * sizeof(*pScissors));

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
}

void
radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->state.dynamic.line_width == lineWidth)
      return;

   cmd_buffer->state.dynamic.line_width = lineWidth;
   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
}

void
radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
                     float depthBiasClamp, float depthBiasSlopeFactor)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
       state->dynamic.depth_bias.clamp == depthBiasClamp &&
       state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
      return;
   }

   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
   state->dynamic.depth_bias.clamp = depthBiasClamp;
   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
}

void
radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
      return;

   memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
}

void
radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bounds.min == minDepthBounds &&
       state->dynamic.depth_bounds.max == maxDepthBounds) {
      return;
   }

   state->dynamic.depth_bounds.min = minDepthBounds;
   state->dynamic.depth_bounds.max = maxDepthBounds;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
}

void
radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                              uint32_t compareMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
   bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
      return;
   }

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.stencil_compare_mask.front = compareMask;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.stencil_compare_mask.back = compareMask;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
}

void
radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                            uint32_t writeMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
   bool back_same = state->dynamic.stencil_write_mask.back == writeMask;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
      return;
   }

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.stencil_write_mask.front = writeMask;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.stencil_write_mask.back = writeMask;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
}

void
radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                            uint32_t reference)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_reference.front == reference;
   bool back_same = state->dynamic.stencil_reference.back == reference;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
      return;
   }

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      cmd_buffer->state.dynamic.stencil_reference.front = reference;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      cmd_buffer->state.dynamic.stencil_reference.back = reference;

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
}

void
radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
                               uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;

   assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
   assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);

   if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
               pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
      return;
   }

   typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
                pDiscardRectangles, discardRectangleCount);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
}

void
radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
                              const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);

   state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
   state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
   state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
   typed_memcpy(&state->dynamic.sample_location.locations[0],
                pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
}

void
radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
                          uint16_t lineStipplePattern)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.line_stipple.factor == lineStippleFactor &&
       state->dynamic.line_stipple.pattern == lineStipplePattern)
      return;

   state->dynamic.line_stipple.factor = lineStippleFactor;
   state->dynamic.line_stipple.pattern = lineStipplePattern;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
}

void
radv_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.cull_mode == cullMode)
      return;

   state->dynamic.cull_mode = cullMode;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
}

void
radv_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.front_face == frontFace)
      return;

   state->dynamic.front_face = frontFace;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
}

void
radv_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
                                VkPrimitiveTopology primitiveTopology)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned primitive_topology = si_translate_prim(primitiveTopology);

   if (state->dynamic.primitive_topology == primitive_topology)
      return;

   state->dynamic.primitive_topology = primitive_topology;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
}

void
radv_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, uint32_t viewportCount,
                                const VkViewport *pViewports)
{
   radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}

void
radv_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, uint32_t scissorCount,
                               const VkRect2D *pScissors)
{
   radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
}

void
radv_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_test_enable == depthTestEnable)
      return;

   state->dynamic.depth_test_enable = depthTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
}

void
radv_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_write_enable == depthWriteEnable)
      return;

   state->dynamic.depth_write_enable = depthWriteEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
}

void
radv_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_compare_op == depthCompareOp)
      return;

   state->dynamic.depth_compare_op = depthCompareOp;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
}

void
radv_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bounds_test_enable == depthBoundsTestEnable)
      return;

   state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
}

void
radv_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.stencil_test_enable == stencilTestEnable)
      return;

   state->dynamic.stencil_test_enable = stencilTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
}

void
radv_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                        VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
                        VkCompareOp compareOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_op.front.fail_op == failOp &&
                     state->dynamic.stencil_op.front.pass_op == passOp &&
                     state->dynamic.stencil_op.front.depth_fail_op == depthFailOp &&
                     state->dynamic.stencil_op.front.compare_op == compareOp;
   bool back_same = state->dynamic.stencil_op.back.fail_op == failOp &&
                    state->dynamic.stencil_op.back.pass_op == passOp &&
                    state->dynamic.stencil_op.back.depth_fail_op == depthFailOp &&
                    state->dynamic.stencil_op.back.compare_op == compareOp;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same))
      return;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
      state->dynamic.stencil_op.front.fail_op = failOp;
      state->dynamic.stencil_op.front.pass_op = passOp;
      state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
      state->dynamic.stencil_op.front.compare_op = compareOp;
   }

   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
      state->dynamic.stencil_op.back.fail_op = failOp;
      state->dynamic.stencil_op.back.pass_op = passOp;
      state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
      state->dynamic.stencil_op.back.compare_op = compareOp;
   }

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
}

void
radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
                                  const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.fragment_shading_rate.size.width == pFragmentSize->width &&
       state->dynamic.fragment_shading_rate.size.height == pFragmentSize->height &&
       state->dynamic.fragment_shading_rate.combiner_ops[0] == combinerOps[0] &&
       state->dynamic.fragment_shading_rate.combiner_ops[1] == combinerOps[1])
      return;

   state->dynamic.fragment_shading_rate.size = *pFragmentSize;
   for (unsigned i = 0; i < 2; i++)
      state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
}

void
radv_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bias_enable == depthBiasEnable)
      return;

   state->dynamic.depth_bias_enable = depthBiasEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
}

void
radv_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.primitive_restart_enable == primitiveRestartEnable)
      return;

   state->dynamic.primitive_restart_enable = primitiveRestartEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
}

void
radv_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
                                      VkBool32 rasterizerDiscardEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.rasterizer_discard_enable == rasterizerDiscardEnable)
      return;

   state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
}

void
radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
{
   /* not implemented */
}

void
radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned logic_op = si_translate_blend_logic_op(logicOp);

   if (state->dynamic.logic_op == logic_op)
      return;

   state->dynamic.logic_op = logic_op;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
}

void
radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
                               const VkBool32 *pColorWriteEnables)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t color_write_enable = 0;

   assert(attachmentCount < MAX_RTS);

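   /* Pack one 4-bit RGBA channel-enable mask per color attachment,
    * matching the 4-bits-per-MRT layout used by CB_TARGET_MASK.
    */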
   for (uint32_t i = 0; i < attachmentCount; i++) {
      color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
   }

   if (state->dynamic.color_write_enable == color_write_enable)
      return;

   state->dynamic.color_write_enable = color_write_enable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
}

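/* Fully dynamic vertex input (VK_EXT_vertex_input_dynamic_state):
 * rebuild the whole radv_vs_input_state from scratch, including
 * per-location format/offset/divisor data and the (possibly-)misaligned
 * masks that GFX6 and GFX10+ use to decide how vertex fetches must be
 * emitted.
 */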
void
radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
                          const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
                          uint32_t vertexAttributeDescriptionCount,
                          const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;

   const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
   for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
      bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];

   cmd_buffer->state.vbo_misaligned_mask = 0;

   memset(state, 0, sizeof(*state));

   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
      unsigned loc = attrib->location;
      const struct util_format_description *format_desc = vk_format_description(attrib->format);
      unsigned nfmt, dfmt;
      bool post_shuffle;
      enum radv_vs_input_alpha_adjust alpha_adjust;

      state->attribute_mask |= 1u << loc;
      state->bindings[loc] = attrib->binding;
      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
         state->instance_rate_inputs |= 1u << loc;
         state->divisors[loc] = binding->divisor;
         if (binding->divisor != 1)
            state->nontrivial_divisors |= 1u << loc;
      }
      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
      state->offsets[loc] = attrib->offset;

      radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
                                   &dfmt, &nfmt, &post_shuffle, &alpha_adjust);

      state->formats[loc] = dfmt | (nfmt << 4);
      const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
                                               (format_desc->block.bits / 8u - 1);
      state->format_align_req_minus_1[loc] = format_align_req_minus_1;
      state->format_sizes[loc] = format_desc->block.bits / 8u;

      if (chip == GFX6 || chip >= GFX10) {
         struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
         unsigned bit = 1u << loc;
         if (binding->stride & format_align_req_minus_1) {
            state->misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit)
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         } else {
            state->possibly_misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit &&
                ((vb[attrib->binding].offset + state->offsets[loc]) & format_align_req_minus_1))
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         }
      }

      if (alpha_adjust) {
         state->alpha_adjust_lo |= (alpha_adjust & 0x1) << loc;
         state->alpha_adjust_hi |= (alpha_adjust >> 1) << loc;
      }

      if (post_shuffle)
         state->post_shuffle |= 1u << loc;
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}

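/* Execute secondary command buffers from a primary. The primary
 * inherits each secondary's worst-case resource requirements (scratch
 * sizes, ring sizes, GDS/tess needs) and the draw/dispatch state the
 * secondaries leave behind, so later commands on the primary neither
 * re-emit redundant state nor trust state that has been clobbered.
 */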
5529 void
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer,uint32_t commandBufferCount,const VkCommandBuffer * pCmdBuffers)5530 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
5531 const VkCommandBuffer *pCmdBuffers)
5532 {
5533 RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
5534
5535 assert(commandBufferCount > 0);
5536
5537 radv_emit_mip_change_flush_default(primary);
5538
5539 /* Emit pending flushes on primary prior to executing secondary */
5540 si_emit_cache_flush(primary);
5541
5542 /* Make sure CP DMA is idle on primary prior to executing secondary. */
5543 si_cp_dma_wait_for_idle(primary);
5544
5545 for (uint32_t i = 0; i < commandBufferCount; i++) {
5546 RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
5547 bool allow_ib2 = true;
5548
5549 if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
5550 secondary->state.uses_draw_indirect_multi) {
5551 /* Do not launch an IB2 for secondary command buffers that contain
5552 * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and would hang the GPU.
5553 */
5554 allow_ib2 = false;
5555 }
5556
5557 primary->scratch_size_per_wave_needed =
5558 MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
5559 primary->scratch_waves_wanted =
5560 MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
5561 primary->compute_scratch_size_per_wave_needed =
5562 MAX2(primary->compute_scratch_size_per_wave_needed,
5563 secondary->compute_scratch_size_per_wave_needed);
5564 primary->compute_scratch_waves_wanted =
5565 MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
5566
5567 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
5568 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
5569 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
5570 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
5571 if (secondary->tess_rings_needed)
5572 primary->tess_rings_needed = true;
5573 if (secondary->sample_positions_needed)
5574 primary->sample_positions_needed = true;
5575 if (secondary->gds_needed)
5576 primary->gds_needed = true;
5577
5578 if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
5579 /* Emit the framebuffer state from primary if secondary
5580 * has been recorded without a framebuffer, otherwise
5581 * fast color/depth clears can't work.
5582 */
5583 radv_emit_fb_mip_change_flush(primary);
5584 radv_emit_framebuffer_state(primary);
5585 }
5586
5587 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
5588
5589 /* When the secondary command buffer is compute only we don't
5590 * need to re-emit the current graphics pipeline.
5591 */
5592 if (secondary->state.emitted_pipeline) {
5593 primary->state.emitted_pipeline = secondary->state.emitted_pipeline;
5594 }
5595
5596 /* When the secondary command buffer is graphics only we don't
5597 * need to re-emit the current compute pipeline.
5598 */
5599 if (secondary->state.emitted_compute_pipeline) {
5600 primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
5601 }
5602
5603 /* Carry over tracked draw state so packets are only re-emitted when needed. */
5604 if (secondary->state.last_primitive_reset_en != -1) {
5605 primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
5606 }
5607
5608 if (secondary->state.last_primitive_reset_index) {
5609 primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
5610 }
5611
5612 if (secondary->state.last_ia_multi_vgt_param) {
5613 primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
5614 }
5615
5616 primary->state.last_first_instance = secondary->state.last_first_instance;
5617 primary->state.last_num_instances = secondary->state.last_num_instances;
5618 primary->state.last_drawid = secondary->state.last_drawid;
5619 primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
5620 primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
5621 primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
5622 primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
5623
5624 if (secondary->state.last_index_type != -1) {
5625 primary->state.last_index_type = secondary->state.last_index_type;
5626 }
5627
5628 primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
5629 primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
5630 primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
5631 }
5632
5633 /* After executing commands from secondary buffers we have to mark
5634 * some states as dirty.
5635 */
5636 primary->state.dirty |=
5637 RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
5638 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
5639 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
5640 }
5641
5642 VkResult
5643 radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
5644 const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
5645 {
5646 RADV_FROM_HANDLE(radv_device, device, _device);
5647 struct radv_cmd_pool *pool;
5648
5649 pool =
5650 vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5651 if (pool == NULL)
5652 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
5653
5654 vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL);
5655
5656 if (pAllocator)
5657 pool->alloc = *pAllocator;
5658 else
5659 pool->alloc = device->vk.alloc;
5660
5661 list_inithead(&pool->cmd_buffers);
5662 list_inithead(&pool->free_cmd_buffers);
5663
5664 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
5665
5666 *pCmdPool = radv_cmd_pool_to_handle(pool);
5667
5668 return VK_SUCCESS;
5669 }
5670
5671 void
5672 radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
5673 const VkAllocationCallbacks *pAllocator)
5674 {
5675 RADV_FROM_HANDLE(radv_device, device, _device);
5676 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5677
5678 if (!pool)
5679 return;
5680
5681 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
5682 {
5683 radv_destroy_cmd_buffer(cmd_buffer);
5684 }
5685
5686 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
5687 {
5688 radv_destroy_cmd_buffer(cmd_buffer);
5689 }
5690
5691 vk_object_base_finish(&pool->base);
5692 vk_free2(&device->vk.alloc, pAllocator, pool);
5693 }
5694
5695 VkResult
5696 radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
5697 {
5698 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5699 VkResult result;
5700
5701 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
5702 {
5703 result = radv_reset_cmd_buffer(cmd_buffer);
5704 if (result != VK_SUCCESS)
5705 return result;
5706 }
5707
5708 return VK_SUCCESS;
5709 }
5710
5711 void
5712 radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
5713 {
5714 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5715
5716 if (!pool)
5717 return;
5718
5719 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
5720 {
5721 radv_destroy_cmd_buffer(cmd_buffer);
5722 }
5723 }
5724
5725 static void
5726 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
5727 {
5728 struct radv_cmd_state *state = &cmd_buffer->state;
5729 struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
5730
5731 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);
5732
5733 radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);
5734
5735 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
5736
5737 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
5738
5739 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5740 const uint32_t a = subpass->attachments[i].attachment;
5741 if (a == VK_ATTACHMENT_UNUSED)
5742 continue;
5743
5744 radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
5745 }
5746
5747 if (subpass->vrs_attachment) {
5748 int idx = subpass->vrs_attachment->attachment;
5749 struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;
5750
5751 if (subpass->depth_stencil_attachment) {
5752 /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
5753 * copy the VRS rates to the HTILE buffer of the attachment.
5754 */
5755 int ds_idx = subpass->depth_stencil_attachment->attachment;
5756 struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
5757 struct radv_image *ds_image = ds_iview->image;
5758
5759 VkExtent2D extent = {
5760 .width = ds_image->info.width,
5761 .height = ds_image->info.height,
5762 };
5763
5764 /* HTILE buffer */
5765 uint64_t htile_offset = ds_image->offset + ds_image->planes[0].surface.meta_offset;
5766 uint64_t htile_size = ds_image->planes[0].surface.meta_slice_size;
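      /* The HTILE metadata lives in the same BO as the image, starting at
       * meta_offset; wrapping just that range in a temporary radv_buffer lets
       * the copy below address HTILE like an ordinary buffer.
       */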
5767 struct radv_buffer htile_buffer;
5768
5769 radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bo, htile_size, htile_offset);
5770
5771 /* Copy the VRS rates to the HTILE buffer. */
5772 radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);
5773
5774 radv_buffer_finish(&htile_buffer);
5775 } else {
5776 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
5777 * to copy the VRS rates to our internal HTILE buffer.
5778 */
5779 struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
5780 struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
5781
5782 if (ds_image) {
5783 /* HTILE buffer */
5784 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
5785
5786 VkExtent2D extent = {
5787 .width = MIN2(fb->width, ds_image->info.width),
5788 .height = MIN2(fb->height, ds_image->info.height),
5789 };
5790
5791 /* Copy the VRS rates to the HTILE buffer. */
5792 radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
5793 }
5794 }
5795 }
5796
5797 radv_describe_barrier_end(cmd_buffer);
5798
5799 radv_cmd_buffer_clear_subpass(cmd_buffer);
5800
5801 assert(cmd_buffer->cs->cdw <= cdw_max);
5802 }
5803
5804 static void
5805 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
5806 {
5807 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
5808
5809 /* Have to be conservative in cmdbuffers with inherited attachments. */
5810 if (!cmd_buffer->state.attachments) {
5811 cmd_buffer->state.rb_noncoherent_dirty = true;
5812 return;
5813 }
5814
5815 for (uint32_t i = 0; i < subpass->color_count; ++i) {
5816 const uint32_t a = subpass->color_attachments[i].attachment;
5817 if (a == VK_ATTACHMENT_UNUSED)
5818 continue;
5819 if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
5820 cmd_buffer->state.rb_noncoherent_dirty = true;
5821 return;
5822 }
5823 }
5824 if (subpass->depth_stencil_attachment &&
5825 !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
5826 .iview->image->l2_coherent)
5827 cmd_buffer->state.rb_noncoherent_dirty = true;
5828 }
5829
5830 void
5831 radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
5832 const struct radv_subpass *subpass)
5833 {
5834 radv_mark_noncoherent_rb(cmd_buffer);
5835 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
5836 }
5837
5838 static void
5839 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
5840 {
5841 struct radv_cmd_state *state = &cmd_buffer->state;
5842 const struct radv_subpass *subpass = state->subpass;
5843 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
5844
5845 radv_cmd_buffer_resolve_subpass(cmd_buffer);
5846
5847 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
5848
5849 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5850 const uint32_t a = subpass->attachments[i].attachment;
5851 if (a == VK_ATTACHMENT_UNUSED)
5852 continue;
5853
5854 if (state->pass->attachments[a].last_subpass_idx != subpass_id)
5855 continue;
5856
5857 VkImageLayout layout = state->pass->attachments[a].final_layout;
5858 VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
5859 struct radv_subpass_attachment att = {a, layout, stencil_layout};
5860 radv_handle_subpass_image_transition(cmd_buffer, att, false);
5861 }
5862
5863 radv_describe_barrier_end(cmd_buffer);
5864 }
5865
5866 void
5867 radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer,
5868 const VkRenderPassBeginInfo *pRenderPassBegin,
5869 const struct radv_extra_render_pass_begin_info *extra_info)
5870 {
5871 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
5872 RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
5873 VkResult result;
5874
5875 cmd_buffer->state.framebuffer = framebuffer;
5876 cmd_buffer->state.pass = pass;
5877 cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
5878
5879 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin, extra_info);
5880 if (result != VK_SUCCESS)
5881 return;
5882
5883 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
5884 if (result != VK_SUCCESS)
5885 return;
5886 }
5887
5888 void
5889 radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
5890 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
5891 const VkSubpassBeginInfo *pSubpassBeginInfo)
5892 {
5893 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5894
5895 radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBeginInfo, NULL);
5896
5897 radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
5898 }
5899
5900 void
5901 radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
5902 const VkSubpassEndInfo *pSubpassEndInfo)
5903 {
5904 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5905
5906 radv_mark_noncoherent_rb(cmd_buffer);
5907
5908 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
5909 radv_cmd_buffer_end_subpass(cmd_buffer);
5910 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
5911 }
5912
5913 static void
5914 radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
5915 {
5916 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
5917 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
5918 if (!radv_get_shader(pipeline, stage))
5919 continue;
5920
5921 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
5922 if (loc->sgpr_idx == -1)
5923 continue;
5924 uint32_t base_reg = pipeline->user_data_0[stage];
5925 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5926 }
5927 if (radv_pipeline_has_gs_copy_shader(pipeline)) {
5928 struct radv_userdata_info *loc =
5929 &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
5930 if (loc->sgpr_idx != -1) {
5931 uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5932 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5933 }
5934 }
5935 }
5936
5937 static void
5938 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
5939 uint32_t use_opaque)
5940 {
5941 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
5942 radeon_emit(cmd_buffer->cs, vertex_count);
5943 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
5944 }
5945
5946 /**
5947 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
5948 *
5949 * The starting address "index_va" may point anywhere within the index buffer. The number of
5950 * indices allocated in the index buffer *past that point* is specified by "max_index_count".
5951 * Hardware uses this information to return 0 for out-of-bounds reads.
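 * For example, if the bound index buffer holds N indices and a draw starts at
 * "firstIndex" f, the callers below advance index_va by f indices and pass
 * max_index_count = N - f (clamped to 0 when f exceeds N).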
5952 */
5953 static void
5954 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
5955 uint32_t max_index_count, uint32_t index_count, bool not_eop)
5956 {
5957 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
5958 radeon_emit(cmd_buffer->cs, max_index_count);
5959 radeon_emit(cmd_buffer->cs, index_va);
5960 radeon_emit(cmd_buffer->cs, index_va >> 32);
5961 radeon_emit(cmd_buffer->cs, index_count);
5962 /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
5963 * can be changed between draws and GS fast launch must be disabled.
5964 * NOT_EOP doesn't work on gfx9 and older.
5965 */
5966 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
5967 }
5968
5969 /* MUST inline this function to avoid massive perf loss in drawoverhead */
5970 ALWAYS_INLINE static void
5971 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
5972 uint32_t draw_count, uint64_t count_va, uint32_t stride)
5973 {
5974 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5975 const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
5976 bool draw_id_enable = cmd_buffer->state.pipeline->graphics.uses_drawid;
5977 uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
5978 uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
5979 bool predicating = cmd_buffer->state.predicating;
5980 assert(base_reg);
5981
5982 /* just reset draw state for vertex data */
5983 cmd_buffer->state.last_first_instance = -1;
5984 cmd_buffer->state.last_num_instances = -1;
5985 cmd_buffer->state.last_drawid = -1;
5986 cmd_buffer->state.last_vertex_offset = -1;
5987
5988 vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
5989 if (cmd_buffer->state.pipeline->graphics.uses_baseinstance)
5990 start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
5991 if (draw_id_enable)
5992 draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
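   /* The resulting layout relative to vtx_base_sgpr is: vertex offset at +0,
    * draw id at +4 when enabled, and start instance at +8 (or +4 without a
    * draw id). The packets want these as dword indices from SI_SH_REG_OFFSET,
    * hence the >> 2 above.
    */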
5993
5994 if (draw_count == 1 && !count_va && !draw_id_enable) {
5995 radeon_emit(cs,
5996 PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
5997 radeon_emit(cs, 0);
5998 radeon_emit(cs, vertex_offset_reg);
5999 radeon_emit(cs, start_instance_reg);
6000 radeon_emit(cs, di_src_sel);
6001 } else {
6002 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
6003 predicating));
6004 radeon_emit(cs, 0);
6005 radeon_emit(cs, vertex_offset_reg);
6006 radeon_emit(cs, start_instance_reg);
6007 radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
6008 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
6009 radeon_emit(cs, draw_count); /* count */
6010 radeon_emit(cs, count_va); /* count_addr */
6011 radeon_emit(cs, count_va >> 32);
6012 radeon_emit(cs, stride); /* stride */
6013 radeon_emit(cs, di_src_sel);
6014
6015 cmd_buffer->state.uses_draw_indirect_multi = true;
6016 }
6017 }
6018
6019 static inline void
6020 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
6021 const struct radv_draw_info *info, const uint32_t vertex_offset)
6022 {
6023 struct radv_cmd_state *state = &cmd_buffer->state;
6024 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6025 const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
6026 const bool uses_drawid = state->pipeline->graphics.uses_drawid;
6027 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
6028 state->pipeline->graphics.vtx_emit_num);
6029
6030 radeon_emit(cs, vertex_offset);
6031 state->last_vertex_offset = vertex_offset;
6032 if (uses_drawid) {
6033 radeon_emit(cs, 0);
6034 state->last_drawid = 0;
6035 }
6036 if (uses_baseinstance) {
6037 radeon_emit(cs, info->first_instance);
6038 state->last_first_instance = info->first_instance;
6039 }
6040 }
6041
6042 ALWAYS_INLINE static void
6043 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6044 const uint32_t vertex_offset)
6045 {
6046 const struct radv_cmd_state *state = &cmd_buffer->state;
6047 const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
6048 const bool uses_drawid = state->pipeline->graphics.uses_drawid;
6049
6050 /* this looks very dumb, but it allows the compiler to optimize better and yields
6051 * ~3-4% perf increase in drawoverhead
6052 */
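   /* A sketch of the intent (an assumption, not measured here): splitting the
    * test lets the compiler check the common vertex-offset change first and
    * skip loading the drawid/instance state on the hot path; a single
    * `a || b || c` condition would be semantically identical.
    */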
6053 if (vertex_offset != state->last_vertex_offset) {
6054 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6055 } else if (uses_drawid && 0 != state->last_drawid) {
6056 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6057 } else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
6058 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6059 }
6060 }
6061
6062 ALWAYS_INLINE static void
6063 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
6064 {
6065 struct radv_cmd_state *state = &cmd_buffer->state;
6066 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6067 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 1 + !!drawid);
6068 radeon_emit(cs, vertex_offset);
6069 state->last_vertex_offset = vertex_offset;
6070 if (drawid)
6071 radeon_emit(cs, drawid);
6073 }
6074
6075 ALWAYS_INLINE static void
6076 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
6077 const struct radv_draw_info *info,
6078 uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
6079 uint32_t stride,
6080 const int32_t *vertexOffset)
6082 {
6083 struct radv_cmd_state *state = &cmd_buffer->state;
6084 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6085 const int index_size = radv_get_vgt_index_size(state->index_type);
6086 unsigned i = 0;
6087 const bool uses_drawid = state->pipeline->graphics.uses_drawid;
6088 const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10;
6089
6090 if (uses_drawid) {
6091 if (vertexOffset) {
6092 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
6093 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6094 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6095
6096 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6097 if (!remaining_indexes &&
6098 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6099 continue;
6100
6101 if (i > 0)
6102 radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
6103
6104 const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6105
6106 if (!state->subpass->view_mask) {
6107 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6108 } else {
6109 u_foreach_bit(view, state->subpass->view_mask) {
6110 radv_emit_view_index(cmd_buffer, view);
6111
6112 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6113 }
6114 }
6115 }
6116 } else {
6117 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6118 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6119
6120 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6121 if (!remaining_indexes &&
6122 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6123 continue;
6124
6125 if (i > 0) {
6126 if (state->last_vertex_offset != draw->vertexOffset)
6127 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
6128 else
6129 radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
6130 } else
6131 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
6132
6133 const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6134
6135 if (!state->subpass->view_mask) {
6136 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6137 } else {
6138 u_foreach_bit(view, state->subpass->view_mask) {
6139 radv_emit_view_index(cmd_buffer, view);
6140
6141 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6142 }
6143 }
6144 }
6145 }
6146 if (drawCount > 1) {
6147 state->last_drawid = drawCount - 1;
6148 }
6149 } else {
6150 if (vertexOffset) {
6151 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX10) {
6152 /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have
6153 * count == 0 for the last draw that doesn't have NOT_EOP.
6154 */
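         /* Worked example: with per-draw index counts {6, 0, 0}, the loop
          * below trims drawCount from 3 to 1 so that the final packet, which
          * is emitted without NOT_EOP, has a nonzero count.
          */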
6155 while (drawCount > 1) {
6156 const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
6157 if (last->indexCount)
6158 break;
6159 drawCount--;
6160 }
6161 }
6162
6163 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
6164 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6165 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6166
6167 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6168 if (!remaining_indexes &&
6169 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6170 continue;
6171
6172 const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6173
6174 if (!state->subpass->view_mask) {
6175 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
6176 } else {
6177 u_foreach_bit(view, state->subpass->view_mask) {
6178 radv_emit_view_index(cmd_buffer, view);
6179
6180 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6181 }
6182 }
6183 }
6184 } else {
6185 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6186 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6187
6188 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6189 if (!remaining_indexes &&
6190 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6191 continue;
6192
6193 const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
6194 const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
6195 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
6196
6197 const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6198
6199 if (!state->subpass->view_mask) {
6200 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
6201 } else {
6202 u_foreach_bit(view, state->subpass->view_mask) {
6203 radv_emit_view_index(cmd_buffer, view);
6204
6205 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6206 }
6207 }
6208 }
6209 }
6210 if (drawCount > 1) {
6211 state->last_drawid = drawCount - 1;
6212 }
6213 }
6214 }
6215
6216 ALWAYS_INLINE static void
6217 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6218 uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
6219 uint32_t use_opaque, uint32_t stride)
6220 {
6221 unsigned i = 0;
6222 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
6223 const bool uses_drawid = cmd_buffer->state.pipeline->graphics.uses_drawid;
6224 uint32_t last_start = 0;
6225
6226 vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
6227 if (!i)
6228 radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
6229 else
6230 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
6231
6232 if (!view_mask) {
6233 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
6234 } else {
6235 u_foreach_bit(view, view_mask) {
6236 radv_emit_view_index(cmd_buffer, view);
6237 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
6238 }
6239 }
6240 last_start = draw->firstVertex;
6241 }
6242 if (drawCount > 1) {
6243 struct radv_cmd_state *state = &cmd_buffer->state;
6244 state->last_vertex_offset = last_start;
6245 if (uses_drawid)
6246 state->last_drawid = drawCount - 1;
6247 }
6248 }
6249
6250 static void
6251 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
6252 const struct radv_draw_info *info)
6253 {
6254 const struct radv_cmd_state *state = &cmd_buffer->state;
6255 struct radeon_winsys *ws = cmd_buffer->device->ws;
6256 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6257 const uint64_t va =
6258 radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
6259 const uint64_t count_va = info->count_buffer
6260 ? radv_buffer_get_va(info->count_buffer->bo) +
6261 info->count_buffer->offset + info->count_buffer_offset
6262 : 0;
6263
6264 radv_cs_add_buffer(ws, cs, info->indirect->bo);
6265
6266 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
6267 radeon_emit(cs, 1);
6268 radeon_emit(cs, va);
6269 radeon_emit(cs, va >> 32);
6270
6271 if (info->count_buffer) {
6272 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
6273 }
6274
6275 if (!state->subpass->view_mask) {
6276 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
6277 info->stride);
6278 } else {
6279 u_foreach_bit(i, state->subpass->view_mask)
6280 {
6281 radv_emit_view_index(cmd_buffer, i);
6282
6283 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
6284 info->stride);
6285 }
6286 }
6287 }
6288
6289 /*
6290 * Vega and Raven have a bug which triggers if there are multiple context
6291 * register contexts active at the same time with different scissor values.
6292 *
6293 * There are two possible workarounds:
6294 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
6295 * there is only ever 1 active set of scissor values at the same time.
6296 *
6297 * 2) Whenever the hardware switches contexts we have to set the scissor
6298 * registers again even if it is a noop. That way the new context gets
6299 * the correct scissor values.
6300 *
6301 * This implements option 2. radv_need_late_scissor_emission needs to
6302 * return true on affected HW if radv_emit_all_graphics_states sets
6303 * any context registers.
6304 */
6305 static bool
6306 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
6307 const struct radv_draw_info *info)
6308 {
6309 struct radv_cmd_state *state = &cmd_buffer->state;
6310
6311 if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
6312 return false;
6313
6314 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
6315 return true;
6316
6317 uint64_t used_states =
6318 cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
6319
6320 /* Index, vertex and streamout buffers don't change context regs, and
6321 * pipeline is already handled.
6322 */
6323 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
6324 RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
6325 RADV_CMD_DIRTY_PIPELINE);
6326
6327 if (cmd_buffer->state.dirty & used_states)
6328 return true;
6329
6330 uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
6331
6332 if (info->indexed && state->dynamic.primitive_restart_enable &&
6333 primitive_reset_index != state->last_primitive_reset_index)
6334 return true;
6335
6336 return false;
6337 }
6338
6339 enum {
6340 ngg_cull_none = 0,
6341 ngg_cull_front_face = 1,
6342 ngg_cull_back_face = 2,
6343 ngg_cull_face_is_ccw = 4,
6344 ngg_cull_small_primitives = 8,
6345 };
6346
6347 ALWAYS_INLINE static bool
6348 radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
6349 bool indirect)
6350 {
6351 /* If we have to draw only a few vertices, we get better latency if
6352 * we disable NGG culling.
6353 *
6354 * When tessellation is used, what matters is the number of tessellated
6355 * vertices, so let's always assume it's not a small draw.
6356 */
6357 return !has_tess && !indirect && vtx_cnt < 128;
6358 }
6359
6360 ALWAYS_INLINE static uint32_t
6361 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
6362 {
6363 const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
6364 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
6365
6366 /* Cull every triangle when rasterizer discard is enabled. */
6367 if (d->rasterizer_discard_enable ||
6368 G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl))
6369 return ngg_cull_front_face | ngg_cull_back_face;
6370
6371 uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
6372 uint32_t nggc_settings = ngg_cull_none;
6373
6374 /* The culling code needs to know whether face is CW or CCW. */
6375 bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
6376 ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
6377 : G_028814_FACE(pa_su_sc_mode_cntl) == 0;
6378
6379 /* Take inverted viewport into account. */
6380 ccw ^= vp_y_inverted;
6381
6382 if (ccw)
6383 nggc_settings |= ngg_cull_face_is_ccw;
6384
6385 /* Face culling settings. */
6386 if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
6387 ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
6388 : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
6389 nggc_settings |= ngg_cull_front_face;
6390 if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
6391 ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
6392 : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
6393 nggc_settings |= ngg_cull_back_face;
6394
6395 /* Small primitive culling is only valid when conservative overestimation is not used. */
6396 if (!pipeline->graphics.uses_conservative_overestimate) {
6397 nggc_settings |= ngg_cull_small_primitives;
6398
6399 /* small_prim_precision = num_samples / 2^subpixel_bits
6400 * num_samples is also always a power of two, so the small prim precision can only be
6401 * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
6402 */
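      /* Worked example, assuming 4x MSAA: subpixel_bits below holds the full
       * scale 256 = 2^8, so log2(4) - log2(256) = 2 - 8 = -6 and the byte
       * shifted into bits 24..31 encodes small_prim_precision = 2^-6.
       */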
6403 unsigned subpixel_bits = 256;
6404 int32_t small_prim_precision_log2 = util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits);
6405 nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
6406 }
6407
6408 return nggc_settings;
6409 }
6410
6411 static void
6412 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
6413 {
6414 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
6415 const unsigned stage = pipeline->graphics.last_vgt_api_stage;
6416 const bool nggc_supported = pipeline->graphics.has_ngg_culling;
6417
6418 if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
6419 /* Current shader doesn't support culling and culling was already disabled:
6420 * No further steps needed, just remember the SGPR's location is not set.
6421 */
6422 cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
6423 return;
6424 }
6425
6426 /* Check dirty flags:
6427 * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
6428 * - Dirty dynamic flags: culling settings may have changed.
6429 */
6430 const bool dirty =
6431 cmd_buffer->state.dirty &
6432 (RADV_CMD_DIRTY_PIPELINE |
6433 RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
6434 RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
6435
6436 /* Check small draw status:
6437 * For small draw calls, we disable culling by setting the SGPR to 0.
6438 */
6439 const bool skip =
6440 radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
6441
6442 /* See if anything changed. */
6443 if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
6444 return;
6445
6446 /* Remember small draw state. */
6447 cmd_buffer->state.last_nggc_skip = skip;
6448 const struct radv_shader_variant *v = pipeline->shaders[stage];
6449 assert(v->info.has_ngg_culling == nggc_supported);
6450
6451 /* Find the user SGPR. */
6452 const uint32_t base_reg = pipeline->user_data_0[stage];
6453 const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
6454 assert(!nggc_supported || nggc_sgpr_idx != -1);
6455
6456 /* Get viewport transform. */
6457 float vp_scale[2], vp_translate[2];
6458 memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
6459 memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
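   /* "-scale + translate > scale + translate" reduces to vp_scale[1] < 0:
    * the viewport is considered y-inverted exactly when the y scale is
    * negative.
    */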
6460 bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
6461
6462 /* Get current culling settings. */
6463 uint32_t nggc_settings = nggc_supported && !skip
6464 ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
6465 : ngg_cull_none;
6466
6467 bool emit_viewport = nggc_settings &&
6468 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
6469 cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
6470 !cmd_buffer->state.last_nggc_settings);
6471
6472 if (emit_viewport) {
6473 /* Correction for inverted Y */
6474 if (vp_y_inverted) {
6475 vp_scale[1] = -vp_scale[1];
6476 vp_translate[1] = -vp_translate[1];
6477 }
6478
6479 /* Correction for number of samples per pixel. */
6480 for (unsigned i = 0; i < 2; ++i) {
6481 vp_scale[i] *= (float) pipeline->graphics.ms.num_samples;
6482 vp_translate[i] *= (float) pipeline->graphics.ms.num_samples;
6483 }
6484
6485 uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
6486 const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
6487 assert(vp_sgpr_idx != -1);
6488 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
6489 radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
6490 }
6491
6492 bool emit_settings = nggc_supported &&
6493 (cmd_buffer->state.last_nggc_settings != nggc_settings ||
6494 cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
6495
6496 /* This needs to be emitted when culling is turned on
6497 * and when it's already on but some settings change.
6498 */
6499 if (emit_settings) {
6500 assert(nggc_sgpr_idx >= 0);
6501 radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
6502 }
6503
6504 /* These only need to be emitted when culling is turned on or off,
6505 * but not when it stays on and just some settings change.
6506 */
6507 if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
6508 uint32_t rsrc2 = v->config.rsrc2;
6509
6510 if (!nggc_settings) {
6511 /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
6512 if (stage != MESA_SHADER_GEOMETRY)
6513 rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
6514 }
6515
6516 /* When the pipeline is dirty and not yet emitted, don't write it here
6517 * because radv_emit_graphics_pipeline will overwrite this register.
6518 */
6519 if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
6520 cmd_buffer->state.emitted_pipeline == pipeline) {
6521 radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
6522 }
6523 }
6524
6525 cmd_buffer->state.last_nggc_settings = nggc_settings;
6526 cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
6527 }
6528
6529 static void
6530 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6531 bool pipeline_is_dirty)
6532 {
6533 bool late_scissor_emission;
6534
6535 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
6536 cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
6537 radv_emit_rbplus_state(cmd_buffer);
6538
6539 if (cmd_buffer->device->physical_device->use_ngg_culling &&
6540 cmd_buffer->state.pipeline->graphics.is_ngg)
6541 radv_emit_ngg_culling_state(cmd_buffer, info);
6542
6543 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
6544 radv_emit_graphics_pipeline(cmd_buffer);
6545
6546 /* This should be before the cmd_buffer->state.dirty is cleared
6547 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
6548 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
6549 late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);
6550
6551 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
6552 radv_emit_framebuffer_state(cmd_buffer);
6553
6554 if (info->indexed) {
6555 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
6556 radv_emit_index_buffer(cmd_buffer, info->indirect);
6557 } else {
6558 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
6559 * so the state must be re-emitted before the next indexed
6560 * draw.
6561 */
6562 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
6563 cmd_buffer->state.last_index_type = -1;
6564 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
6565 }
6566 }
6567
6568 radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
6569
6570 radv_emit_draw_registers(cmd_buffer, info);
6571
6572 if (late_scissor_emission)
6573 radv_emit_scissor(cmd_buffer);
6574 }
6575
6576 /* MUST inline this function to avoid massive perf loss in drawoverhead */
6577 ALWAYS_INLINE static bool
6578 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
6579 {
6580 const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
6581 const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
6582 cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
6583
6584 ASSERTED const unsigned cdw_max =
6585 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
6586
6587 if (likely(!info->indirect)) {
6588 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
6589 * no workaround for indirect draws, but we can at least skip
6590 * direct draws.
6591 */
6592 if (unlikely(!info->instance_count))
6593 return false;
6594
6595 /* Handle count == 0. */
6596 if (unlikely(!info->count && !info->strmout_buffer))
6597 return false;
6598 }
6599
6600 /* Need to apply this workaround early as it can set flush flags. */
6601 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
6602 radv_emit_fb_mip_change_flush(cmd_buffer);
6603
6604 /* Use optimal packet order based on whether we need to sync the
6605 * pipeline.
6606 */
6607 if (cmd_buffer->state.flush_bits &
6608 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
6609 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
6610 /* If we have to wait for idle, set all states first, so that
6611 * all SET packets are processed in parallel with previous draw
6612 * calls. Then upload descriptors, set shader pointers, and
6613 * draw, and prefetch at the end. This ensures that the time
6614 * the CUs are idle is very short. (there are only SET_SH
6615 * packets between the wait and the draw)
6616 */
6617 radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
6618 si_emit_cache_flush(cmd_buffer);
6619 /* <-- CUs are idle here --> */
6620
6621 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
6622 } else {
6623 /* If we don't wait for idle, start prefetches first, then set
6624 * states, and draw at the end.
6625 */
6626 si_emit_cache_flush(cmd_buffer);
6627
6628 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
6629 /* Only prefetch the vertex shader and VBO descriptors
6630 * in order to start the draw as soon as possible.
6631 */
6632 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, true);
6633 }
6634
6635 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
6636
6637 radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
6638 }
6639
6640 radv_describe_draw(cmd_buffer);
6641 if (likely(!info->indirect)) {
6642 struct radv_cmd_state *state = &cmd_buffer->state;
6643 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6644 assert(state->pipeline->graphics.vtx_base_sgpr);
6645 if (state->last_num_instances != info->instance_count) {
6646 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
6647 radeon_emit(cs, info->instance_count);
6648 state->last_num_instances = info->instance_count;
6649 }
6650 }
6651 assert(cmd_buffer->cs->cdw <= cdw_max);
6652
6653 return true;
6654 }
6655
6656 static void
6657 radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
6658 {
6659 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
6660 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
6661 /* Start prefetches after the draw has been started. Both will
6662 * run in parallel, but starting the draw first is more
6663 * important.
6664 */
6665 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
6666 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, false);
6667 }
6668
6669 /* Workaround for a VGT hang when streamout is enabled.
6670 * It must be done after drawing.
6671 */
6672 if (cmd_buffer->state.streamout.streamout_enabled &&
6673 (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
6674 rad_info->family == CHIP_FIJI)) {
6675 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
6676 }
6677
6678 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
6679 }
6680
6681 void
6682 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
6683 uint32_t firstVertex, uint32_t firstInstance)
6684 {
6685 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6686 struct radv_draw_info info;
6687
6688 info.count = vertexCount;
6689 info.instance_count = instanceCount;
6690 info.first_instance = firstInstance;
6691 info.strmout_buffer = NULL;
6692 info.indirect = NULL;
6693 info.indexed = false;
6694
6695 if (!radv_before_draw(cmd_buffer, &info, 1))
6696 return;
6697 const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
6698 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
6699 radv_after_draw(cmd_buffer);
6700 }
6701
6702 void
6703 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
6704 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
6705 {
6706 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6707 struct radv_draw_info info;
6708
6709 if (!drawCount)
6710 return;
6711
6712 info.count = pVertexInfo->vertexCount;
6713 info.instance_count = instanceCount;
6714 info.first_instance = firstInstance;
6715 info.strmout_buffer = NULL;
6716 info.indirect = NULL;
6717 info.indexed = false;
6718
6719 if (!radv_before_draw(cmd_buffer, &info, drawCount))
6720 return;
6721 radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
6722 radv_after_draw(cmd_buffer);
6723 }
6724
6725 void
6726 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
6727 uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
6728 {
6729 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6730 struct radv_draw_info info;
6731
6732 info.indexed = true;
6733 info.count = indexCount;
6734 info.instance_count = instanceCount;
6735 info.first_instance = firstInstance;
6736 info.strmout_buffer = NULL;
6737 info.indirect = NULL;
6738
6739 if (!radv_before_draw(cmd_buffer, &info, 1))
6740 return;
6741 const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
6742 radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
6743 radv_after_draw(cmd_buffer);
6744 }
6745
6746 void radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
6747 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
6748 {
6749 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6750 struct radv_draw_info info;
6751
6752 if (!drawCount)
6753 return;
6754
6755 const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
6756 info.indexed = true;
6757 info.count = minfo->indexCount;
6758 info.instance_count = instanceCount;
6759 info.first_instance = firstInstance;
6760 info.strmout_buffer = NULL;
6761 info.indirect = NULL;
6762
6763 if (!radv_before_draw(cmd_buffer, &info, drawCount))
6764 return;
6765 radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
6766 radv_after_draw(cmd_buffer);
6767 }
6768
6769 void
6770 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
6771 uint32_t drawCount, uint32_t stride)
6772 {
6773 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6774 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6775 struct radv_draw_info info;
6776
6777 info.count = drawCount;
6778 info.indirect = buffer;
6779 info.indirect_offset = offset;
6780 info.stride = stride;
6781 info.strmout_buffer = NULL;
6782 info.count_buffer = NULL;
6783 info.indexed = false;
6784 info.instance_count = 0;
6785
6786 if (!radv_before_draw(cmd_buffer, &info, 1))
6787 return;
6788 radv_emit_indirect_draw_packets(cmd_buffer, &info);
6789 radv_after_draw(cmd_buffer);
6790 }
6791
6792 void
6793 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
6794 uint32_t drawCount, uint32_t stride)
6795 {
6796 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6797 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6798 struct radv_draw_info info;
6799
6800 info.indexed = true;
6801 info.count = drawCount;
6802 info.indirect = buffer;
6803 info.indirect_offset = offset;
6804 info.stride = stride;
6805 info.count_buffer = NULL;
6806 info.strmout_buffer = NULL;
6807 info.instance_count = 0;
6808
6809 if (!radv_before_draw(cmd_buffer, &info, 1))
6810 return;
6811 radv_emit_indirect_draw_packets(cmd_buffer, &info);
6812 radv_after_draw(cmd_buffer);
6813 }
6814
6815 void
6816 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
6817 VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
6818 uint32_t maxDrawCount, uint32_t stride)
6819 {
6820 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6821 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6822 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
6823 struct radv_draw_info info;
6824
6825 info.count = maxDrawCount;
6826 info.indirect = buffer;
6827 info.indirect_offset = offset;
6828 info.count_buffer = count_buffer;
6829 info.count_buffer_offset = countBufferOffset;
6830 info.stride = stride;
6831 info.strmout_buffer = NULL;
6832 info.indexed = false;
6833 info.instance_count = 0;
6834
6835 if (!radv_before_draw(cmd_buffer, &info, 1))
6836 return;
6837 radv_emit_indirect_draw_packets(cmd_buffer, &info);
6838 radv_after_draw(cmd_buffer);
6839 }
6840
6841 void
6842 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
6843 VkDeviceSize offset, VkBuffer _countBuffer,
6844 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
6845 uint32_t stride)
6846 {
6847 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6848 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6849 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
6850 struct radv_draw_info info;
6851
6852 info.indexed = true;
6853 info.count = maxDrawCount;
6854 info.indirect = buffer;
6855 info.indirect_offset = offset;
6856 info.count_buffer = count_buffer;
6857 info.count_buffer_offset = countBufferOffset;
6858 info.stride = stride;
6859 info.strmout_buffer = NULL;
6860 info.instance_count = 0;
6861
6862 if (!radv_before_draw(cmd_buffer, &info, 1))
6863 return;
6864 radv_emit_indirect_draw_packets(cmd_buffer, &info);
6865 radv_after_draw(cmd_buffer);
6866 }
6867
6868 struct radv_dispatch_info {
6869 /**
6870 * Determine the layout of the grid (in block units) to be used.
6871 */
6872 uint32_t blocks[3];
6873
6874 /**
6875 * A starting offset for the grid. If unaligned is set, the offset
6876 * must still be aligned.
6877 */
6878 uint32_t offsets[3];
6879 /**
6880 * Whether it's an unaligned compute dispatch.
6881 */
6882 bool unaligned;
6883
6884 /**
6885 * Indirect compute parameters resource.
6886 */
6887 struct radeon_winsys_bo *indirect;
6888 uint64_t va;
6889 };
6890
6891 static void
6892 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
6893 const struct radv_dispatch_info *info)
6894 {
6895 struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
6896 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
6897 struct radeon_winsys *ws = cmd_buffer->device->ws;
6898 bool predicating = cmd_buffer->state.predicating;
6899 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6900 struct radv_userdata_info *loc;
6901
6902 radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);
6903
6904 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
6905
6906 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);
6907
6908 if (compute_shader->info.wave_size == 32) {
6909 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
6910 dispatch_initiator |= S_00B800_CS_W32_EN(1);
6911 }
6912
6913 if (info->indirect) {
6914 radv_cs_add_buffer(ws, cs, info->indirect);
6915
6916 if (loc->sgpr_idx != -1) {
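/* Copy the three dispatch dimensions from the indirect buffer into the
 * grid-size user SGPRs so the shader can read the equivalent of
 * gl_NumWorkGroups.
 */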
6917 for (unsigned i = 0; i < 3; ++i) {
6918 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6919 radeon_emit(cs,
6920 COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG));
6921 radeon_emit(cs, (info->va + 4 * i));
6922 radeon_emit(cs, (info->va + 4 * i) >> 32);
6923 radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
6924 radeon_emit(cs, 0);
6925 }
6926 }
6927
6928 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
6929 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1));
6930 radeon_emit(cs, info->va);
6931 radeon_emit(cs, info->va >> 32);
6932 radeon_emit(cs, dispatch_initiator);
6933 } else {
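/* On the GFX ring, DISPATCH_INDIRECT takes an offset relative to a base
 * address that must be programmed first with SET_BASE; the MEC variant
 * above takes the full VA directly.
 */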
6934 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
6935 radeon_emit(cs, 1);
6936 radeon_emit(cs, info->va);
6937 radeon_emit(cs, info->va >> 32);
6938
6939 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
6940 radeon_emit(cs, 0);
6941 radeon_emit(cs, dispatch_initiator);
6942 }
6943 } else {
6944 unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
6945 unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
6946
6947 if (info->unaligned) {
6948 unsigned *cs_block_size = compute_shader->info.cs.block_size;
6949 unsigned remainder[3];
6950
6951 /* If a dimension is already aligned, its remainder is an entire
6952 * block size, not 0.
6953 */
6954 remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
6955 remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
6956 remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
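/* Worked example (assuming a 64-wide workgroup in x): blocks[0] = 70 gives
 * remainder[0] = 6 threads in the final partial group, while an
 * already-aligned count gives a full 64.
 */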
6957
6958 blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
6959 blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
6960 blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
6961
6962 for (unsigned i = 0; i < 3; ++i) {
6963 assert(offsets[i] % cs_block_size[i] == 0);
6964 offsets[i] /= cs_block_size[i];
6965 }
6966
6967 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
6968 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
6969 S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
6970 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
6971 S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
6972 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
6973 S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
6974
6975 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
6976 }
6977
6978 if (loc->sgpr_idx != -1) {
6979 assert(loc->num_sgprs == 3);
6980
6981 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
6982 radeon_emit(cs, blocks[0]);
6983 radeon_emit(cs, blocks[1]);
6984 radeon_emit(cs, blocks[2]);
6985 }
6986
6987 if (offsets[0] || offsets[1] || offsets[2]) {
6988 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
6989 radeon_emit(cs, offsets[0]);
6990 radeon_emit(cs, offsets[1]);
6991 radeon_emit(cs, offsets[2]);
6992
6993 /* The blocks in the packet are not counts but end values. */
6994 for (unsigned i = 0; i < 3; ++i)
6995 blocks[i] += offsets[i];
6996 } else {
6997 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
6998 }
6999
7000 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
7001 radeon_emit(cs, blocks[0]);
7002 radeon_emit(cs, blocks[1]);
7003 radeon_emit(cs, blocks[2]);
7004 radeon_emit(cs, dispatch_initiator);
7005 }
7006
7007 assert(cmd_buffer->cs->cdw <= cdw_max);
7008 }
7009
7010 static void
7011 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
7012 struct radv_pipeline *pipeline,
7013 VkPipelineBindPoint bind_point)
7014 {
7015 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, pipeline, bind_point);
7016 radv_flush_constants(cmd_buffer,
7017 bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
7018 ? RADV_RT_STAGE_BITS
7019 : VK_SHADER_STAGE_COMPUTE_BIT,
7020 pipeline, bind_point);
7021 }
7022
7023 static void
7024 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
7025 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
7026 {
7027 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
7028 bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline;
7029 bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug &&
7030 info->blocks[0] * info->blocks[1] * info->blocks[2] > 256;
7031
7032 if (cs_regalloc_hang)
7033 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
7034 RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
7035
7036 if (cmd_buffer->state.flush_bits &
7037 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
7038 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
7039 /* If we have to wait for idle, set all states first, so that
7040 * all SET packets are processed in parallel with previous draw
7041 * calls. Then upload descriptors, set shader pointers, and
7042 * dispatch, and prefetch at the end. This ensures that the
7043 * time the CUs are idle is very short. (there are only SET_SH
7044 * packets between the wait and the draw)
7045 */
7046 radv_emit_compute_pipeline(cmd_buffer, pipeline);
7047 si_emit_cache_flush(cmd_buffer);
7048 /* <-- CUs are idle here --> */
7049
7050 radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
7051
7052 radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
7053 /* <-- CUs are busy here --> */
7054
7055 /* Start prefetches after the dispatch has been started. Both
7056 * will run in parallel, but starting the dispatch first is
7057 * more important.
7058 */
7059 if (has_prefetch && pipeline_is_dirty) {
7060 radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
7061 }
7062 } else {
7063 /* If we don't wait for idle, start prefetches first, then set
7064 * states, and dispatch at the end.
7065 */
7066 si_emit_cache_flush(cmd_buffer);
7067
7068 if (has_prefetch && pipeline_is_dirty) {
7069 radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
7070 }
7071
7072 radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
7073
7074 radv_emit_compute_pipeline(cmd_buffer, pipeline);
7075 radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
7076 }
7077
7078 if (pipeline_is_dirty) {
7079 /* Raytracing uses compute shaders but has separate bind points and pipelines.
7080 * So if we set compute userdata & shader registers, we should dirty the
7081 * raytracing ones, and vice versa.
7082 *
7083 * We only need to do this when the pipeline is dirty because when we switch between
7084 * the two we always need to switch pipelines.
7085 */
7086 radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
7087 ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
7088 : VK_PIPELINE_BIND_POINT_COMPUTE);
7089 }
7090
7091 if (cs_regalloc_hang)
7092 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
7093
7094 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
7095 }
7096
7097 static void
7098 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
7099 {
7100 radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
7101 VK_PIPELINE_BIND_POINT_COMPUTE);
7102 }
7103
7104 void
7105 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
7106 uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
7107 {
7108 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7109 struct radv_dispatch_info info = {0};
7110
7111 info.blocks[0] = x;
7112 info.blocks[1] = y;
7113 info.blocks[2] = z;
7114
7115 info.offsets[0] = base_x;
7116 info.offsets[1] = base_y;
7117 info.offsets[2] = base_z;
7118 radv_compute_dispatch(cmd_buffer, &info);
7119 }
7120
7121 void
7122 radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
7123 {
7124 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
7125 }
7126
7127 void
7128 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
7129 {
7130 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7131 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
7132 struct radv_dispatch_info info = {0};
7133
7134 info.indirect = buffer->bo;
7135 info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
7136
7137 radv_compute_dispatch(cmd_buffer, &info);
7138 }
7139
7140 void
7141 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
7142 {
7143 struct radv_dispatch_info info = {0};
7144
7145 info.blocks[0] = x;
7146 info.blocks[1] = y;
7147 info.blocks[2] = z;
7148 info.unaligned = 1;
7149
7150 radv_compute_dispatch(cmd_buffer, &info);
7151 }
7152
7153 void
7154 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
7155 {
7156 struct radv_dispatch_info info = {0};
7157
7158 info.indirect = bo;
7159 info.va = va;
7160
7161 radv_compute_dispatch(cmd_buffer, &info);
7162 }
7163
7164 static void
7165 radv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
7166 {
7167 radv_dispatch(cmd_buffer, info, cmd_buffer->state.rt_pipeline,
7168 VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
7169 }
7170
7171 static bool
7172 radv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer,
7173 const VkStridedDeviceAddressRegionKHR *tables)
7174 {
7175 struct radv_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
7176 uint32_t base_reg;
7177 void *ptr;
7178 uint32_t *desc_ptr;
7179 uint32_t offset;
7180
7181 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr))
7182 return false;
7183
7184 desc_ptr = ptr;
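/* Pack the four SBT regions (raygen, miss, hit, callable) as 16-byte
 * records: 64-bit GPU address, 32-bit stride and 32 bits of padding.
 */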
7185 for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) {
7186 desc_ptr[0] = tables[i].deviceAddress;
7187 desc_ptr[1] = tables[i].deviceAddress >> 32;
7188 desc_ptr[2] = tables[i].stride;
7189 desc_ptr[3] = 0;
7190 }
7191
7192 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
7193 struct radv_userdata_info *loc =
7194 radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
7195 if (loc->sgpr_idx == -1)
7196 return true;
7197
7198 base_reg = pipeline->user_data_0[MESA_SHADER_COMPUTE];
7199 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
7200 false);
7201 return true;
7202 }
7203
7204 void
7205 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
7206 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
7207 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
7208 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
7209 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
7210 uint32_t width, uint32_t height, uint32_t depth)
7211 {
7212 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7213 struct radv_dispatch_info info = {0};
7214
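/* Launch one lane per ray: treat (width, height, depth) as thread counts
 * and let the unaligned dispatch path split them into workgroups.
 */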
7215 info.blocks[0] = width;
7216 info.blocks[1] = height;
7217 info.blocks[2] = depth;
7218 info.unaligned = 1;
7219
7220 const VkStridedDeviceAddressRegionKHR tables[] = {
7221 *pRaygenShaderBindingTable,
7222 *pMissShaderBindingTable,
7223 *pHitShaderBindingTable,
7224 *pCallableShaderBindingTable,
7225 };
7226
7227 if (!radv_rt_bind_tables(cmd_buffer, tables)) {
7228 return;
7229 }
7230
7231 struct radv_userdata_info *loc = radv_lookup_user_sgpr(
7232 cmd_buffer->state.rt_pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE);
7233
7234 if (loc->sgpr_idx != -1) {
7235 assert(loc->num_sgprs == 3);
7236
7237 radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
7238 radeon_emit(cmd_buffer->cs, width);
7239 radeon_emit(cmd_buffer->cs, height);
7240 radeon_emit(cmd_buffer->cs, depth);
7241 }
7242
7243 radv_rt_dispatch(cmd_buffer, &info);
7244 }
7245
7246 static void
7247 radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
7248 {
7249 unsigned wave_size = 0;
7250 unsigned scratch_bytes_per_wave = 0;
7251
7252 if (cmd_buffer->state.rt_pipeline) {
7253 scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->scratch_bytes_per_wave;
7254 wave_size = cmd_buffer->state.rt_pipeline->shaders[MESA_SHADER_COMPUTE]->info.wave_size;
7255 }
7256
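/* The app-provided stack size is per invocation, but scratch is allocated
 * per wave, so scale it by the wave size.
 */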
7257 /* The hardware register is specified as a multiple of 256 DWORDS. */
7258 scratch_bytes_per_wave += align(size * wave_size, 1024);
7259
7260 cmd_buffer->compute_scratch_size_per_wave_needed =
7261 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
7262 }
7263
7264 void
7265 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
7266 {
7267 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7268
7269 radv_set_rt_stack_size(cmd_buffer, size);
7270 cmd_buffer->state.rt_stack_size = size;
7271 }
7272
7273 void
7274 radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer)
7275 {
7276 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
7277 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
7278
7279 cmd_buffer->state.pass = NULL;
7280 cmd_buffer->state.subpass = NULL;
7281 cmd_buffer->state.attachments = NULL;
7282 cmd_buffer->state.framebuffer = NULL;
7283 cmd_buffer->state.subpass_sample_locs = NULL;
7284 }
7285
7286 void
7287 radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
7288 {
7289 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7290
7291 radv_mark_noncoherent_rb(cmd_buffer);
7292
7293 radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
7294
7295 radv_cmd_buffer_end_subpass(cmd_buffer);
7296
7297 radv_cmd_buffer_end_render_pass(cmd_buffer);
7298 }
7299
7300 /*
7301 * For HTILE we have the following interesting clear words:
7302 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
7303 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
7304 * 0xfffffff0: Clear depth to 1.0
7305 * 0x00000000: Clear depth to 0.0
7306 */
7307 static void
7308 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7309 const VkImageSubresourceRange *range)
7310 {
7311 struct radv_cmd_state *state = &cmd_buffer->state;
7312 uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
7313 VkClearDepthStencilValue value = {0};
7314 struct radv_barrier_data barrier = {0};
7315
7316 barrier.layout_transitions.init_mask_ram = 1;
7317 radv_describe_layout_transition(cmd_buffer, &barrier);
7318
7319 /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
7320 * consistent in considering previous rendering work for WAW hazards. */
7321 state->flush_bits |=
7322 radv_src_access_flush(cmd_buffer, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
7323
7324 if (image->planes[0].surface.has_stencil &&
7325 !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
7326 /* Flush caches before performing a separate aspect initialization because it's a
7327 * read-modify-write operation.
7328 */
7329 state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image);
7330 }
7331
7332 state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
7333
7334 radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
7335
7336 if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
7337 /* Initialize the TC-compat metadata value to 0 because by
7338 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
7339 * have to conditionally update its value when performing
7340 * a fast depth clear.
7341 */
7342 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
7343 }
7344 }
7345
7346 static void
7347 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7348 VkImageLayout src_layout, bool src_render_loop,
7349 VkImageLayout dst_layout, bool dst_render_loop,
7350 unsigned src_queue_mask, unsigned dst_queue_mask,
7351 const VkImageSubresourceRange *range,
7352 struct radv_sample_locations_state *sample_locs)
7353 {
7354 struct radv_device *device = cmd_buffer->device;
7355
7356 if (!radv_htile_enabled(image, range->baseMipLevel))
7357 return;
7358
7359 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
7360 radv_initialize_htile(cmd_buffer, image, range);
7361 } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
7362 src_queue_mask) &&
7363 radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
7364 dst_queue_mask)) {
7365 radv_initialize_htile(cmd_buffer, image, range);
7366 } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
7367 src_queue_mask) &&
7368 !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
7369 dst_queue_mask)) {
7370 cmd_buffer->state.flush_bits |=
7371 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
7372
7373 radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
7374
7375 cmd_buffer->state.flush_bits |=
7376 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
7377 }
7378 }
7379
7380 static uint32_t
7381 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7382 const VkImageSubresourceRange *range, uint32_t value)
7383 {
7384 struct radv_barrier_data barrier = {0};
7385
7386 barrier.layout_transitions.init_mask_ram = 1;
7387 radv_describe_layout_transition(cmd_buffer, &barrier);
7388
7389 return radv_clear_cmask(cmd_buffer, image, range, value);
7390 }
7391
7392 uint32_t
7393 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7394 const VkImageSubresourceRange *range)
7395 {
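/* Expanded FMASK values mapping each fragment to its own sample,
 * e.g. 0xE4 = 0b11100100 is the identity mapping for 4 samples.
 */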
7396 static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
7397 uint32_t log2_samples = util_logbase2(image->info.samples);
7398 uint32_t value = fmask_clear_values[log2_samples];
7399 struct radv_barrier_data barrier = {0};
7400
7401 barrier.layout_transitions.init_mask_ram = 1;
7402 radv_describe_layout_transition(cmd_buffer, &barrier);
7403
7404 return radv_clear_fmask(cmd_buffer, image, range, value);
7405 }
7406
7407 uint32_t
7408 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7409 const VkImageSubresourceRange *range, uint32_t value)
7410 {
7411 struct radv_barrier_data barrier = {0};
7412 uint32_t flush_bits = 0;
7413 unsigned size = 0;
7414
7415 barrier.layout_transitions.init_mask_ram = 1;
7416 radv_describe_layout_transition(cmd_buffer, &barrier);
7417
7418 flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
7419
7420 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
7421 /* When DCC is enabled with mipmaps, some levels might not
7422 * support fast clears and we have to initialize them as "fully
7423 * expanded".
7424 */
7425 /* Compute the size of all fast clearable DCC levels. */
7426 for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
7427 struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
7428 unsigned dcc_fast_clear_size =
7429 dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
7430
7431 if (!dcc_fast_clear_size)
7432 break;
7433
7434 size = dcc_level->dcc_offset + dcc_fast_clear_size;
7435 }
7436
7437 /* Initialize the mipmap levels without DCC. */
7438 if (size != image->planes[0].surface.meta_size) {
7439 flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bo,
7440 image->offset + image->planes[0].surface.meta_offset + size,
7441 image->planes[0].surface.meta_size - size, 0xffffffff);
7442 }
7443 }
7444
7445 return flush_bits;
7446 }
7447
7448 /**
7449 * Initialize DCC/FMASK/CMASK metadata for a color image.
7450 */
7451 static void
7452 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7453 VkImageLayout src_layout, bool src_render_loop,
7454 VkImageLayout dst_layout, bool dst_render_loop,
7455 unsigned src_queue_mask, unsigned dst_queue_mask,
7456 const VkImageSubresourceRange *range)
7457 {
7458 uint32_t flush_bits = 0;
7459
7460 /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
7461 * consistent in considering previous rendering work for WAW hazards.
7462 */
7463 cmd_buffer->state.flush_bits |=
7464 radv_src_access_flush(cmd_buffer, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, image);
7465
7466 if (radv_image_has_cmask(image)) {
7467 uint32_t value;
7468
7469 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
7470 /* TODO: Fix clearing CMASK layers on GFX9. */
7471 if (radv_image_is_tc_compat_cmask(image) ||
7472 (radv_image_has_fmask(image) &&
7473 radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
7474 dst_render_loop, dst_queue_mask))) {
7475 value = 0xccccccccu;
7476 } else {
7477 value = 0xffffffffu;
7478 }
7479 } else {
7480 static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
7481 uint32_t log2_samples = util_logbase2(image->info.samples);
7482
7483 value = cmask_clear_values[log2_samples];
7484 }
7485
7486 flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
7487 }
7488
7489 if (radv_image_has_fmask(image)) {
7490 flush_bits |= radv_init_fmask(cmd_buffer, image, range);
7491 }
7492
7493 if (radv_dcc_enabled(image, range->baseMipLevel)) {
7494 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
7495
7496 if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
7497 dst_layout, dst_render_loop, dst_queue_mask)) {
7498 value = 0u;
7499 }
7500
7501 flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
7502 }
7503
7504 if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
7505 radv_update_fce_metadata(cmd_buffer, image, range, false);
7506
7507 uint32_t color_values[2] = {0};
7508 radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
7509 }
7510
7511 cmd_buffer->state.flush_bits |= flush_bits;
7512 }
7513
7514 static void
7515 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7516 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
7517 {
7518 if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
7519 (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
7520 (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
7521 radv_retile_dcc(cmd_buffer, image);
7522 }
7523
7524 static bool
7525 radv_image_need_retile(const struct radv_image *image)
7526 {
7527 return image->planes[0].surface.display_dcc_offset &&
7528 image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
7529 }
7530
7531 /**
7532 * Handle color image transitions for DCC/FMASK/CMASK.
7533 */
7534 static void
7535 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7536 VkImageLayout src_layout, bool src_render_loop,
7537 VkImageLayout dst_layout, bool dst_render_loop,
7538 unsigned src_queue_mask, unsigned dst_queue_mask,
7539 const VkImageSubresourceRange *range)
7540 {
7541 bool dcc_decompressed = false, fast_clear_flushed = false;
7542
7543 if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
7544 !radv_dcc_enabled(image, range->baseMipLevel))
7545 return;
7546
7547 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
7548 radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
7549 dst_render_loop, src_queue_mask, dst_queue_mask, range);
7550
7551 if (radv_image_need_retile(image))
7552 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
7553 return;
7554 }
7555
7556 if (radv_dcc_enabled(image, range->baseMipLevel)) {
7557 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
7558 cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
7559 } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
7560 src_layout, src_render_loop, src_queue_mask) &&
7561 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
7562 dst_layout, dst_render_loop, dst_queue_mask)) {
7563 radv_decompress_dcc(cmd_buffer, image, range);
7564 dcc_decompressed = true;
7565 } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7566 src_layout, src_render_loop, src_queue_mask) &&
7567 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7568 dst_layout, dst_render_loop, dst_queue_mask)) {
7569 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
7570 fast_clear_flushed = true;
7571 }
7572
7573 if (radv_image_need_retile(image))
7574 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
7575 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
7576 if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7577 src_layout, src_render_loop, src_queue_mask) &&
7578 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7579 dst_layout, dst_render_loop, dst_queue_mask)) {
7580 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
7581 fast_clear_flushed = true;
7582 }
7583 }
7584
7585 /* MSAA color decompress. */
7586 if (radv_image_has_fmask(image) &&
7587 (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
7588 radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
7589 !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
7590 if (radv_dcc_enabled(image, range->baseMipLevel) &&
7591 !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
7592 /* A DCC decompress is required before expanding FMASK
7593 * when DCC stores aren't supported to avoid being in
7594 * a state where DCC is compressed and the main
7595 * surface is uncompressed.
7596 */
7597 radv_decompress_dcc(cmd_buffer, image, range);
7598 } else if (!fast_clear_flushed) {
7599 /* An FMASK decompress is required before expanding
7600 * FMASK.
7601 */
7602 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
7603 }
7604
7605 struct radv_barrier_data barrier = {0};
7606 barrier.layout_transitions.fmask_color_expand = 1;
7607 radv_describe_layout_transition(cmd_buffer, &barrier);
7608
7609 radv_expand_fmask_image_inplace(cmd_buffer, image, range);
7610 }
7611 }
7612
7613 static void
7614 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7615 VkImageLayout src_layout, bool src_render_loop,
7616 VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family,
7617 uint32_t dst_family, const VkImageSubresourceRange *range,
7618 struct radv_sample_locations_state *sample_locs)
7619 {
7620 if (image->exclusive && src_family != dst_family) {
7621 /* This is an acquire or a release operation and there will be
7622 * a corresponding release/acquire. Do the transition in the
7623 * most flexible queue. */
7624
7625 assert(src_family == cmd_buffer->queue_family_index ||
7626 dst_family == cmd_buffer->queue_family_index);
7627
7628 if (src_family == VK_QUEUE_FAMILY_EXTERNAL || src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
7629 return;
7630
7631 if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
7632 return;
7633
7634 if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
7635 (src_family == RADV_QUEUE_GENERAL || dst_family == RADV_QUEUE_GENERAL))
7636 return;
7637 }
7638
7639 unsigned src_queue_mask =
7640 radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index);
7641 unsigned dst_queue_mask =
7642 radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index);
7643
7644 if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
7645 return;
7646
7647 if (vk_format_has_depth(image->vk_format)) {
7648 radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
7649 dst_render_loop, src_queue_mask, dst_queue_mask, range,
7650 sample_locs);
7651 } else {
7652 radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
7653 dst_render_loop, src_queue_mask, dst_queue_mask, range);
7654 }
7655 }
7656
7657 struct radv_barrier_info {
7658 enum rgp_barrier_reason reason;
7659 uint32_t eventCount;
7660 const VkEvent *pEvents;
7661 VkPipelineStageFlags srcStageMask;
7662 VkPipelineStageFlags dstStageMask;
7663 };
7664
7665 static void
7666 radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
7667 const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount,
7668 const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount,
7669 const VkImageMemoryBarrier *pImageMemoryBarriers, const struct radv_barrier_info *info)
7670 {
7671 struct radeon_cmdbuf *cs = cmd_buffer->cs;
7672 enum radv_cmd_flush_bits src_flush_bits = 0;
7673 enum radv_cmd_flush_bits dst_flush_bits = 0;
7674
7675 if (cmd_buffer->state.subpass)
7676 radv_mark_noncoherent_rb(cmd_buffer);
7677
7678 radv_describe_barrier_start(cmd_buffer, info->reason);
7679
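/* For CmdWaitEvents, make the CP busy-wait until each event's BO contains 1,
 * i.e. until the corresponding CmdSetEvent has landed.
 */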
7680 for (unsigned i = 0; i < info->eventCount; ++i) {
7681 RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
7682 uint64_t va = radv_buffer_get_va(event->bo);
7683
7684 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
7685
7686 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
7687
7688 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
7689 assert(cmd_buffer->cs->cdw <= cdw_max);
7690 }
7691
7692 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
7693 src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask, NULL);
7694 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, NULL);
7695 }
7696
7697 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
7698 src_flush_bits |=
7699 radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask, NULL);
7700 dst_flush_bits |=
7701 radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, NULL);
7702 }
7703
7704 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
7705 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
7706
7707 src_flush_bits |=
7708 radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask, image);
7709 dst_flush_bits |=
7710 radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, image);
7711 }
7712
7713 /* The Vulkan spec 1.1.98 says:
7714 *
7715 * "An execution dependency with only
7716 * VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
7717 * will only prevent that stage from executing in subsequently
7718 * submitted commands. As this stage does not perform any actual
7719 * execution, this is not observable - in effect, it does not delay
7720 * processing of subsequent commands. Similarly an execution dependency
7721 * with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
7722 * will effectively not wait for any prior commands to complete."
7723 */
7724 if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
7725 radv_stage_flush(cmd_buffer, info->srcStageMask);
7726 cmd_buffer->state.flush_bits |= src_flush_bits;
7727
7728 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
7729 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
7730
7731 const struct VkSampleLocationsInfoEXT *sample_locs_info =
7732 vk_find_struct_const(pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
7733 struct radv_sample_locations_state sample_locations = {0};
7734
7735 if (sample_locs_info) {
7736 assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
7737 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
7738 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
7739 sample_locations.count = sample_locs_info->sampleLocationsCount;
7740 typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
7741 sample_locs_info->sampleLocationsCount);
7742 }
7743
7744 radv_handle_image_transition(
7745 cmd_buffer, image, pImageMemoryBarriers[i].oldLayout,
7746 false, /* Outside of a renderpass we are never in a renderloop */
7747 pImageMemoryBarriers[i].newLayout,
7748 false, /* Outside of a renderpass we are never in a renderloop */
7749 pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex,
7750 &pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
7751 }
7752
7753 /* Make sure CP DMA is idle because the driver might have performed a
7754 * DMA operation for copying or filling buffers/images.
7755 */
7756 if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
7757 si_cp_dma_wait_for_idle(cmd_buffer);
7758
7759 cmd_buffer->state.flush_bits |= dst_flush_bits;
7760
7761 radv_describe_barrier_end(cmd_buffer);
7762 }
7763
7764 void
7765 radv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
7766 VkPipelineStageFlags destStageMask, VkBool32 byRegion,
7767 uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
7768 uint32_t bufferMemoryBarrierCount,
7769 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
7770 uint32_t imageMemoryBarrierCount,
7771 const VkImageMemoryBarrier *pImageMemoryBarriers)
7772 {
7773 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7774 struct radv_barrier_info info;
7775
7776 info.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
7777 info.eventCount = 0;
7778 info.pEvents = NULL;
7779 info.srcStageMask = srcStageMask;
7780 info.dstStageMask = destStageMask;
7781
7782 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
7783 pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
7784 }
7785
7786 static void
7787 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
7788 VkPipelineStageFlags stageMask, unsigned value)
7789 {
7790 struct radeon_cmdbuf *cs = cmd_buffer->cs;
7791 uint64_t va = radv_buffer_get_va(event->bo);
7792
7793 si_emit_cache_flush(cmd_buffer);
7794
7795 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
7796
7797 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
7798
7799 /* Flags that only require a top-of-pipe event. */
7800 VkPipelineStageFlags top_of_pipe_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
7801
7802 /* Flags that only require a post-index-fetch event. */
7803 VkPipelineStageFlags post_index_fetch_flags =
7804 top_of_pipe_flags | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
7805
7806 /* Flags that only require signaling post PS. */
7807 VkPipelineStageFlags post_ps_flags =
7808 post_index_fetch_flags | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
7809 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
7810 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
7811 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT |
7812 VK_PIPELINE_STAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
7813 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
7814
7815 /* Flags that only require signaling post CS. */
7816 VkPipelineStageFlags post_cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
7817
7818 /* Make sure CP DMA is idle because the driver might have performed a
7819 * DMA operation for copying or filling buffers/images.
7820 */
7821 if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
7822 si_cp_dma_wait_for_idle(cmd_buffer);
7823
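/* Pick the cheapest signal that still orders against every stage in
 * stageMask: a PFP write, then an ME write, then an EOP event.
 */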
7824 if (!(stageMask & ~top_of_pipe_flags)) {
7825 /* Just need to sync the PFP engine. */
7826 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
7827 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
7828 radeon_emit(cs, va);
7829 radeon_emit(cs, va >> 32);
7830 radeon_emit(cs, value);
7831 } else if (!(stageMask & ~post_index_fetch_flags)) {
7832 /* Sync ME because PFP reads index and indirect buffers. */
7833 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
7834 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
7835 radeon_emit(cs, va);
7836 radeon_emit(cs, va >> 32);
7837 radeon_emit(cs, value);
7838 } else {
7839 unsigned event_type;
7840
7841 if (!(stageMask & ~post_ps_flags)) {
7842 /* Sync previous fragment shaders. */
7843 event_type = V_028A90_PS_DONE;
7844 } else if (!(stageMask & ~post_cs_flags)) {
7845 /* Sync previous compute shaders. */
7846 event_type = V_028A90_CS_DONE;
7847 } else {
7848 /* Otherwise, sync all prior GPU work. */
7849 event_type = V_028A90_BOTTOM_OF_PIPE_TS;
7850 }
7851
7852 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
7853 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
7854 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
7855 cmd_buffer->gfx9_eop_bug_va);
7856 }
7857
7858 assert(cmd_buffer->cs->cdw <= cdw_max);
7859 }
7860
7861 void
7862 radv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
7863 {
7864 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7865 RADV_FROM_HANDLE(radv_event, event, _event);
7866
7867 write_event(cmd_buffer, event, stageMask, 1);
7868 }
7869
7870 void
7871 radv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
7872 {
7873 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7874 RADV_FROM_HANDLE(radv_event, event, _event);
7875
7876 write_event(cmd_buffer, event, stageMask, 0);
7877 }
7878
7879 void
7880 radv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
7881 VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
7882 uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
7883 uint32_t bufferMemoryBarrierCount,
7884 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
7885 uint32_t imageMemoryBarrierCount,
7886 const VkImageMemoryBarrier *pImageMemoryBarriers)
7887 {
7888 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7889 struct radv_barrier_info info;
7890
7891 info.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS;
7892 info.eventCount = eventCount;
7893 info.pEvents = pEvents;
7894 info.srcStageMask = 0;
info.dstStageMask = dstStageMask; /* radv_barrier() reads this; don't leave it uninitialized */
7895
7896 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
7897 pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
7898 }
7899
7900 void
7901 radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
7902 {
7903 /* No-op */
7904 }
7905
7906 /* VK_EXT_conditional_rendering */
7907 void
7908 radv_CmdBeginConditionalRenderingEXT(
7909 VkCommandBuffer commandBuffer,
7910 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
7911 {
7912 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7913 RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
7914 struct radeon_cmdbuf *cs = cmd_buffer->cs;
7915 unsigned pred_op = PREDICATION_OP_BOOL32;
7916 bool draw_visible = true;
7917 uint64_t va;
7918
7919 va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
7920
7921 /* By default, if the 32-bit value at offset in buffer memory is zero,
7922 * then the rendering commands are discarded, otherwise they are
7923 * executed as normal. If the inverted flag is set, all commands are
7924 * discarded if the value is non-zero.
7925 */
7926 if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
7927 draw_visible = false;
7928 }
7929
7930 si_emit_cache_flush(cmd_buffer);
7931
7932 if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL &&
7933 !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
7934 uint64_t pred_value = 0, pred_va;
7935 unsigned pred_offset;
7936
7937 /* From the Vulkan spec 1.1.107:
7938 *
7939 * "If the 32-bit value at offset in buffer memory is zero,
7940 * then the rendering commands are discarded, otherwise they
7941 * are executed as normal. If the value of the predicate in
7942 * buffer memory changes while conditional rendering is
7943 * active, the rendering commands may be discarded in an
7944 * implementation-dependent way. Some implementations may
7945 * latch the value of the predicate upon beginning conditional
7946 * rendering while others may read it before every rendering
7947 * command."
7948 *
7949 * But the AMD hardware treats the predicate as a 64-bit
7950 * value, which means we need a workaround in the driver.
7951 * Luckily, we aren't required to support the value changing
7952 * while predication is active.
7953 *
7954 * The workaround is as follows:
7955 * 1) allocate a 64-bit value in the upload BO and initialize it
7956 * to 0
7957 * 2) copy the 32-bit predicate value to the upload BO
7958 * 3) use the new allocated VA address for predication
7959 *
7960 * Based on the conditionalrender demo, it's faster to do the
7961 * COPY_DATA in ME (+ sync PFP) instead of PFP.
7962 */
7963 radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
7964
7965 pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
7966
7967 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
7968 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7969 COPY_DATA_WR_CONFIRM);
7970 radeon_emit(cs, va);
7971 radeon_emit(cs, va >> 32);
7972 radeon_emit(cs, pred_va);
7973 radeon_emit(cs, pred_va >> 32);
7974
7975 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
7976 radeon_emit(cs, 0);
7977
7978 va = pred_va;
7979 pred_op = PREDICATION_OP_BOOL64;
7980 }
7981
7982 /* Enable predication for this command buffer. */
7983 si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
7984 cmd_buffer->state.predicating = true;
7985
7986 /* Store conditional rendering user info. */
7987 cmd_buffer->state.predication_type = draw_visible;
7988 cmd_buffer->state.predication_op = pred_op;
7989 cmd_buffer->state.predication_va = va;
7990 }
7991
7992 void
7993 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
7994 {
7995 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7996
7997 /* Disable predication for this command buffer. */
7998 si_emit_set_predication_state(cmd_buffer, false, 0, 0);
7999 cmd_buffer->state.predicating = false;
8000
8001 /* Reset conditional rendering user info. */
8002 cmd_buffer->state.predication_type = -1;
8003 cmd_buffer->state.predication_op = 0;
8004 cmd_buffer->state.predication_va = 0;
8005 }
8006
8007 /* VK_EXT_transform_feedback */
8008 void
8009 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
8010 uint32_t bindingCount, const VkBuffer *pBuffers,
8011 const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
8012 {
8013 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8014 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
8015 uint8_t enabled_mask = 0;
8016
8017 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
8018 for (uint32_t i = 0; i < bindingCount; i++) {
8019 uint32_t idx = firstBinding + i;
8020
8021 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
8022 sb[idx].offset = pOffsets[i];
8023
8024 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
8025 sb[idx].size = sb[idx].buffer->size - sb[idx].offset;
8026 } else {
8027 sb[idx].size = pSizes[i];
8028 }
8029
8030 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
8031
8032 enabled_mask |= 1 << idx;
8033 }
8034
8035 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
8036
8037 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
8038 }
8039
8040 static void
8041 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
8042 {
8043 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8044 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8045
8046 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
8047 radeon_emit(cs, S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | S_028B94_RAST_STREAM(0) |
8048 S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
8049 S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
8050 S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
8051 radeon_emit(cs, so->hw_enabled_mask & so->enabled_stream_buffers_mask);
8052
8053 cmd_buffer->state.context_roll_without_scissor_emitted = true;
8054 }
8055
8056 static void
8057 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
8058 {
8059 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8060 bool old_streamout_enabled = so->streamout_enabled;
8061 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
8062
8063 so->streamout_enabled = enable;
8064
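/* VGT_STRMOUT_BUFFER_CONFIG holds a 4-bit buffer-enable mask per stream;
 * replicate the enabled-buffer mask into all four streams.
 */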
8065 so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
8066 (so->enabled_mask << 12);
8067
8068 if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
8069 ((old_streamout_enabled != so->streamout_enabled) ||
8070 (old_hw_enabled_mask != so->hw_enabled_mask)))
8071 radv_emit_streamout_enable(cmd_buffer);
8072
8073 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
8074 cmd_buffer->gds_needed = true;
8075 cmd_buffer->gds_oa_needed = true;
8076 }
8077 }
8078
8079 static void
8080 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
8081 {
8082 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8083 unsigned reg_strmout_cntl;
8084
8085 /* The register is at different places on different ASICs. */
8086 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
8087 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
8088 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
8089 } else {
8090 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
8091 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
8092 }
8093
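/* Ask the VGT to write back the current streamout offsets, then wait for
 * the OFFSET_UPDATE_DONE bit in CP_STRMOUT_CNTL.
 */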
8094 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
8095 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
8096
8097 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
8098 radeon_emit(cs,
8099 WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
8100 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
8101 radeon_emit(cs, 0);
8102 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
8103 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
8104 radeon_emit(cs, 4); /* poll interval */
8105 }
8106
8107 static void
8108 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
8109 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8110 const VkDeviceSize *pCounterBufferOffsets)
8111
8112 {
8113 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
8114 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8115 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8116
8117 radv_flush_vgt_streamout(cmd_buffer);
8118
8119 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
8120 u_foreach_bit(i, so->enabled_mask)
8121 {
8122 int32_t counter_buffer_idx = i - firstCounterBuffer;
8123 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
8124 counter_buffer_idx = -1;
8125
8126 /* AMD GCN binds streamout buffers as shader resources.
8127 * VGT only counts primitives and tells the shader through
8128 * SGPRs what to do.
8129 */
8130 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
8131 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */
8132 radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
8133
8134 cmd_buffer->state.context_roll_without_scissor_emitted = true;
8135
8136 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
8137 /* The array of counter buffers is optional. */
8138 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
8139 uint64_t va = radv_buffer_get_va(buffer->bo);
8140 uint64_t counter_buffer_offset = 0;
8141
8142 if (pCounterBufferOffsets)
8143 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
8144
8145 va += buffer->offset + counter_buffer_offset;
8146
8147 /* Append */
8148 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
8149 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
8150 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
8151 radeon_emit(cs, 0); /* unused */
8152 radeon_emit(cs, 0); /* unused */
8153 radeon_emit(cs, va); /* src address lo */
8154 radeon_emit(cs, va >> 32); /* src address hi */
8155
8156 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
8157 } else {
8158 /* Start from the beginning. */
8159 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
8160 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
8161 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
8162 radeon_emit(cs, 0); /* unused */
8163 radeon_emit(cs, 0); /* unused */
8164 radeon_emit(cs, 0); /* unused */
8165 radeon_emit(cs, 0); /* unused */
8166 }
8167 }
8168
8169 radv_set_streamout_enable(cmd_buffer, true);
8170 }
8171
8172 static void
8173 gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
8174 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8175 const VkDeviceSize *pCounterBufferOffsets)
8176 {
8177 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8178 unsigned last_target = util_last_bit(so->enabled_mask) - 1;
8179 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8180
8181 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
8182 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
8183
8184 /* Sync because the next streamout operation will overwrite GDS and we
8185 * have to make sure it's idle.
8186 * TODO: Improve by tracking if there is a streamout operation in
8187 * flight.
8188 */
8189 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
8190 si_emit_cache_flush(cmd_buffer);
8191
8192 u_foreach_bit(i, so->enabled_mask)
8193 {
8194 int32_t counter_buffer_idx = i - firstCounterBuffer;
8195 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
8196 counter_buffer_idx = -1;
8197
8198 bool append =
8199 counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
8200 uint64_t va = 0;
8201
8202 if (append) {
8203 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
8204 uint64_t counter_buffer_offset = 0;
8205
8206 if (pCounterBufferOffsets)
8207 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
8208
8209 va += radv_buffer_get_va(buffer->bo);
8210 va += buffer->offset + counter_buffer_offset;
8211
8212 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
8213 }
8214
8215 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
8216 radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
8217 S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
8218 radeon_emit(cs, va);
8219 radeon_emit(cs, va >> 32);
8220 radeon_emit(cs, 4 * i); /* destination in GDS */
8221 radeon_emit(cs, 0);
8222 radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
8223 }
8224
8225 radv_set_streamout_enable(cmd_buffer, true);
8226 }
8227
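/* Entry point for vkCmdBeginTransformFeedbackEXT. For reference, a
 * typical capture on the application side looks roughly like this
 * (sketch, not driver code):
 *
 *    vkCmdBindTransformFeedbackBuffersEXT(cmd, 0, 1, &xfb_buf, &offset, &size);
 *    vkCmdBeginTransformFeedbackEXT(cmd, 0, 1, &counter_buf, &counter_offset);
 *    vkCmdDraw(cmd, vertex_count, 1, 0, 0);
 *    vkCmdEndTransformFeedbackEXT(cmd, 0, 1, &counter_buf, &counter_offset);
 *
 * Passing the same counter buffer to a later Begin resumes appending
 * where the previous capture stopped.
 */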
void
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                  const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
                                 pCounterBuffers, pCounterBufferOffsets);
   } else {
      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                                pCounterBufferOffsets);
   }
}

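/* Legacy streamout end: flush VGT streamout, store the final
 * BUFFER_FILLED_SIZE to the counter buffer (when one is provided) so a
 * later Begin can resume from it, then zero VGT_STRMOUT_BUFFER_SIZE to
 * deactivate the target.
 */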
static void
radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                        const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);       /* dst address lo */
         radeon_emit(cs, va >> 32); /* dst address hi */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if there is no buffer bound. This ensures
       * that the primitives-emitted query won't increment.
       */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

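/* NGG streamout end (GFX10+): read the final append offset back from
 * GDS with an end-of-pipe event (EOP_DATA_SEL_GDS) that writes it to
 * the counter buffer through the TC L2 once the PS_DONE event signals.
 */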
static void
gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                         const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                    radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
                                    EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

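/* Entry point for vkCmdEndTransformFeedbackEXT; like Begin, this just
 * dispatches to the NGG (GDS-based) or legacy (register-based) path.
 */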
void
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                               pCounterBufferOffsets);
   } else {
      radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                              pCounterBufferOffsets);
   }
}

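/* vkCmdDrawIndirectByteCountEXT: draw with a vertex count taken from a
 * streamout counter buffer rather than from the CPU. The draw packet
 * is emitted with USE_OPAQUE set, so the vertex count is derived on
 * the GPU from the streamout filled size and the given vertex stride;
 * info.count stays 0 because the count is never known on the CPU.
 */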
void
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
                                 uint32_t firstInstance, VkBuffer _counterBuffer,
                                 VkDeviceSize counterBufferOffset, uint32_t counterOffset,
                                 uint32_t vertexStride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
   struct radv_draw_info info;

   info.count = 0;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = counterBuffer;
   info.strmout_buffer_offset = counterBufferOffset;
   info.stride = vertexStride;
   info.indexed = false;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   struct VkMultiDrawInfoEXT minfo = { 0, 0 };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
   radv_after_draw(cmd_buffer);
}

/* VK_AMD_buffer_marker */
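/* Writes a 32-bit marker value to the destination buffer, typically
 * used to narrow down GPU hangs post-mortem. A top-of-pipe-only stage
 * mask writes the marker immediately with COPY_DATA; anything else
 * defers the write to a bottom-of-pipe end-of-pipe event so the marker
 * only lands after all prior work has completed.
 */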
void
radv_CmdWriteBufferMarkerAMD(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage,
                             VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;

   si_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

   if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, marker);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
