1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "radv_cs.h"
29 #include "radv_debug.h"
30 #include "radv_meta.h"
31 #include "radv_private.h"
32 #include "radv_radeon_winsys.h"
33 #include "radv_shader.h"
34 #include "sid.h"
35 #include "vk_format.h"
36 #include "vk_util.h"
37 
38 #include "ac_debug.h"
39 
40 #include "util/fast_idiv_by_const.h"
41 
42 enum {
43    RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
44    RADV_PREFETCH_VS = (1 << 1),
45    RADV_PREFETCH_TCS = (1 << 2),
46    RADV_PREFETCH_TES = (1 << 3),
47    RADV_PREFETCH_GS = (1 << 4),
48    RADV_PREFETCH_PS = (1 << 5),
49    RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
50                             RADV_PREFETCH_GS | RADV_PREFETCH_PS)
51 };
52 
53 enum {
54    RADV_RT_STAGE_BITS = (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
55                          VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
56                          VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR)
57 };
58 
59 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
60                                          struct radv_image *image, VkImageLayout src_layout,
61                                          bool src_render_loop, VkImageLayout dst_layout,
62                                          bool dst_render_loop, uint32_t src_family,
63                                          uint32_t dst_family, const VkImageSubresourceRange *range,
64                                          struct radv_sample_locations_state *sample_locs);
65 
66 static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);
67 
68 const struct radv_dynamic_state default_dynamic_state = {
69    .viewport =
70       {
71          .count = 0,
72       },
73    .scissor =
74       {
75          .count = 0,
76       },
77    .line_width = 1.0f,
78    .depth_bias =
79       {
80          .bias = 0.0f,
81          .clamp = 0.0f,
82          .slope = 0.0f,
83       },
84    .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
85    .depth_bounds =
86       {
87          .min = 0.0f,
88          .max = 1.0f,
89       },
90    .stencil_compare_mask =
91       {
92          .front = ~0u,
93          .back = ~0u,
94       },
95    .stencil_write_mask =
96       {
97          .front = ~0u,
98          .back = ~0u,
99       },
100    .stencil_reference =
101       {
102          .front = 0u,
103          .back = 0u,
104       },
105    .line_stipple =
106       {
107          .factor = 0u,
108          .pattern = 0u,
109       },
110    .cull_mode = 0u,
111    .front_face = 0u,
112    .primitive_topology = 0u,
113    .fragment_shading_rate =
114       {
115          .size = {1u, 1u},
116          .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
117                           VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
118       },
119    .depth_bias_enable = 0u,
120    .primitive_restart_enable = 0u,
121    .rasterizer_discard_enable = 0u,
122    .logic_op = 0u,
123    .color_write_enable = 0xffffffffu,
124 };
125 
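/**
 * Copy dynamic state from @src (typically the bound pipeline's baked dynamic
 * state) into the command buffer, marking a state group dirty only when its
 * value actually changed, so redundant binds don't trigger re-emission.
 */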
126 static void
127 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
128 {
129    struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
130    uint64_t copy_mask = src->mask;
131    uint64_t dest_mask = 0;
132 
133    dest->discard_rectangle.count = src->discard_rectangle.count;
134    dest->sample_location.count = src->sample_location.count;
135 
136    if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
137       if (dest->viewport.count != src->viewport.count) {
138          dest->viewport.count = src->viewport.count;
139          dest_mask |= RADV_DYNAMIC_VIEWPORT;
140       }
141 
142       if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
143                  src->viewport.count * sizeof(VkViewport))) {
144          typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
145          typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
146          dest_mask |= RADV_DYNAMIC_VIEWPORT;
147       }
148    }
149 
150    if (copy_mask & RADV_DYNAMIC_SCISSOR) {
151       if (dest->scissor.count != src->scissor.count) {
152          dest->scissor.count = src->scissor.count;
153          dest_mask |= RADV_DYNAMIC_SCISSOR;
154       }
155 
156       if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
157                  src->scissor.count * sizeof(VkRect2D))) {
158          typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
159          dest_mask |= RADV_DYNAMIC_SCISSOR;
160       }
161    }
162 
163    if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
164       if (dest->line_width != src->line_width) {
165          dest->line_width = src->line_width;
166          dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
167       }
168    }
169 
170    if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
171       if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
172          dest->depth_bias = src->depth_bias;
173          dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
174       }
175    }
176 
177    if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
178       if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
179          typed_memcpy(dest->blend_constants, src->blend_constants, 4);
180          dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
181       }
182    }
183 
184    if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
185       if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
186          dest->depth_bounds = src->depth_bounds;
187          dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
188       }
189    }
190 
191    if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
192       if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
193                  sizeof(src->stencil_compare_mask))) {
194          dest->stencil_compare_mask = src->stencil_compare_mask;
195          dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
196       }
197    }
198 
199    if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
200       if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
201                  sizeof(src->stencil_write_mask))) {
202          dest->stencil_write_mask = src->stencil_write_mask;
203          dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
204       }
205    }
206 
207    if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
208       if (memcmp(&dest->stencil_reference, &src->stencil_reference,
209                  sizeof(src->stencil_reference))) {
210          dest->stencil_reference = src->stencil_reference;
211          dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
212       }
213    }
214 
215    if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
216       if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
217                  src->discard_rectangle.count * sizeof(VkRect2D))) {
218          typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
219                       src->discard_rectangle.count);
220          dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
221       }
222    }
223 
224    if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
225       if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
226           dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
227           dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
228           memcmp(&dest->sample_location.locations, &src->sample_location.locations,
229                  src->sample_location.count * sizeof(VkSampleLocationEXT))) {
230          dest->sample_location.per_pixel = src->sample_location.per_pixel;
231          dest->sample_location.grid_size = src->sample_location.grid_size;
232          typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
233                       src->sample_location.count);
234          dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
235       }
236    }
237 
238    if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
239       if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
240          dest->line_stipple = src->line_stipple;
241          dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
242       }
243    }
244 
245    if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
246       if (dest->cull_mode != src->cull_mode) {
247          dest->cull_mode = src->cull_mode;
248          dest_mask |= RADV_DYNAMIC_CULL_MODE;
249       }
250    }
251 
252    if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
253       if (dest->front_face != src->front_face) {
254          dest->front_face = src->front_face;
255          dest_mask |= RADV_DYNAMIC_FRONT_FACE;
256       }
257    }
258 
259    if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
260       if (dest->primitive_topology != src->primitive_topology) {
261          dest->primitive_topology = src->primitive_topology;
262          dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
263       }
264    }
265 
266    if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
267       if (dest->depth_test_enable != src->depth_test_enable) {
268          dest->depth_test_enable = src->depth_test_enable;
269          dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
270       }
271    }
272 
273    if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
274       if (dest->depth_write_enable != src->depth_write_enable) {
275          dest->depth_write_enable = src->depth_write_enable;
276          dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
277       }
278    }
279 
280    if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
281       if (dest->depth_compare_op != src->depth_compare_op) {
282          dest->depth_compare_op = src->depth_compare_op;
283          dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
284       }
285    }
286 
287    if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
288       if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
289          dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
290          dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
291       }
292    }
293 
294    if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
295       if (dest->stencil_test_enable != src->stencil_test_enable) {
296          dest->stencil_test_enable = src->stencil_test_enable;
297          dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
298       }
299    }
300 
301    if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
302       if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
303          dest->stencil_op = src->stencil_op;
304          dest_mask |= RADV_DYNAMIC_STENCIL_OP;
305       }
306    }
307 
308    if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
309       if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
310                  sizeof(src->fragment_shading_rate))) {
311          dest->fragment_shading_rate = src->fragment_shading_rate;
312          dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
313       }
314    }
315 
316    if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
317       if (dest->depth_bias_enable != src->depth_bias_enable) {
318          dest->depth_bias_enable = src->depth_bias_enable;
319          dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
320       }
321    }
322 
323    if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
324       if (dest->primitive_restart_enable != src->primitive_restart_enable) {
325          dest->primitive_restart_enable = src->primitive_restart_enable;
326          dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
327       }
328    }
329 
330    if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
331       if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
332          dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
333          dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
334       }
335    }
336 
337    if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
338       if (dest->logic_op != src->logic_op) {
339          dest->logic_op = src->logic_op;
340          dest_mask |= RADV_DYNAMIC_LOGIC_OP;
341       }
342    }
343 
344    if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
345       if (dest->color_write_enable != src->color_write_enable) {
346          dest->color_write_enable = src->color_write_enable;
347          dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
348       }
349    }
350 
351    cmd_buffer->state.dirty |= dest_mask;
352 }
353 
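/**
 * Cache the streamout strides and enabled-buffer mask from the pipeline's
 * streamout shader. Skipped when NGG streamout is in use, which is handled
 * through a different path.
 */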
354 static void
355 radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
356 {
357    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
358    struct radv_shader_info *info;
359 
360    if (!pipeline->streamout_shader || cmd_buffer->device->physical_device->use_ngg_streamout)
361       return;
362 
363    info = &pipeline->streamout_shader->info;
364    for (int i = 0; i < MAX_SO_BUFFERS; i++)
365       so->stride_in_dw[i] = info->so.strides[i];
366 
367    so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
368 }
369 
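/**
 * Compute queues on GFX7+ are executed by the MEC (micro engine compute),
 * which requires different packet variants in a few places.
 */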
370 bool
371 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
372 {
373    return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
374           cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
375 }
376 
377 enum ring_type
378 radv_queue_family_to_ring(int f)
379 {
380    switch (f) {
381    case RADV_QUEUE_GENERAL:
382       return RING_GFX;
383    case RADV_QUEUE_COMPUTE:
384       return RING_COMPUTE;
385    case RADV_QUEUE_TRANSFER:
386       return RING_DMA;
387    default:
388       unreachable("Unknown queue family");
389    }
390 }
391 
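/**
 * Emit a WRITE_DATA packet that writes @count dwords from @data to GPU memory
 * at @va, with write confirmation, using the given engine (e.g. PFP or ME).
 */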
392 static void
393 radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
394                             unsigned count, const uint32_t *data)
395 {
396    struct radeon_cmdbuf *cs = cmd_buffer->cs;
397 
398    radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
399 
400    radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
401    radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
402    radeon_emit(cs, va);
403    radeon_emit(cs, va >> 32);
404    radeon_emit_array(cs, data, count);
405 }
406 
407 static void
408 radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
409                      unsigned size)
410 {
411    uint32_t *zeroes = alloca(size);
412    memset(zeroes, 0, size);
413    radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
414 }
415 
416 static void
417 radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
418 {
419    list_del(&cmd_buffer->pool_link);
420 
421    list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
422    {
423       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
424       list_del(&up->list);
425       free(up);
426    }
427 
428    if (cmd_buffer->upload.upload_bo)
429       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);
430 
431    if (cmd_buffer->cs)
432       cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
433 
434    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
435       free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
436       vk_object_base_finish(&cmd_buffer->descriptors[i].push_set.set.base);
437    }
438 
439    vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
440 
441    vk_command_buffer_finish(&cmd_buffer->vk);
442    vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
443 }
444 
445 static VkResult
446 radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
447                        VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
448 {
449    struct radv_cmd_buffer *cmd_buffer;
450    unsigned ring;
451    cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
452    if (cmd_buffer == NULL)
453       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
454 
455    VkResult result =
456       vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
457    if (result != VK_SUCCESS) {
458       vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
459       return result;
460    }
461 
462    cmd_buffer->device = device;
463    cmd_buffer->pool = pool;
464    cmd_buffer->level = level;
465 
466    list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
467    cmd_buffer->queue_family_index = pool->queue_family_index;
468 
469    ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
470 
471    cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
472    if (!cmd_buffer->cs) {
473       radv_destroy_cmd_buffer(cmd_buffer);
474       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
475    }
476 
477    vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
478                        VK_OBJECT_TYPE_DESCRIPTOR_SET);
479 
480    for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
481       vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
482                           VK_OBJECT_TYPE_DESCRIPTOR_SET);
483 
484    *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
485 
486    list_inithead(&cmd_buffer->upload.list);
487 
488    return VK_SUCCESS;
489 }
490 
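/**
 * Return the command buffer to the initial state: free extra upload BOs,
 * reset cached descriptor/dirty state and re-create the small fence and
 * GFX9 EOP-bug scratch areas used by cache-flush emission.
 */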
491 static VkResult
492 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
493 {
494    vk_command_buffer_reset(&cmd_buffer->vk);
495 
496    cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
497 
498    list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
499    {
500       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
501       list_del(&up->list);
502       free(up);
503    }
504 
505    cmd_buffer->push_constant_stages = 0;
506    cmd_buffer->scratch_size_per_wave_needed = 0;
507    cmd_buffer->scratch_waves_wanted = 0;
508    cmd_buffer->compute_scratch_size_per_wave_needed = 0;
509    cmd_buffer->compute_scratch_waves_wanted = 0;
510    cmd_buffer->esgs_ring_size_needed = 0;
511    cmd_buffer->gsvs_ring_size_needed = 0;
512    cmd_buffer->tess_rings_needed = false;
513    cmd_buffer->gds_needed = false;
514    cmd_buffer->gds_oa_needed = false;
515    cmd_buffer->sample_positions_needed = false;
516 
517    if (cmd_buffer->upload.upload_bo)
518       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
519    cmd_buffer->upload.offset = 0;
520 
521    cmd_buffer->record_result = VK_SUCCESS;
522 
523    memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
524 
525    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
526       cmd_buffer->descriptors[i].dirty = 0;
527       cmd_buffer->descriptors[i].valid = 0;
528       cmd_buffer->descriptors[i].push_dirty = false;
529    }
530 
531    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
532        cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
533       unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
534       unsigned fence_offset, eop_bug_offset;
535       void *fence_ptr;
536 
537       radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
538       memset(fence_ptr, 0, 8);
539 
540       cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
541       cmd_buffer->gfx9_fence_va += fence_offset;
542 
543       radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
544 
545       if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
546          /* Allocate a buffer for the EOP bug on GFX9. */
547          radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
548          memset(fence_ptr, 0, 16 * num_db);
549          cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
550          cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
551 
552          radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
553       }
554    }
555 
556    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
557 
558    return cmd_buffer->record_result;
559 }
560 
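/**
 * Replace the upload BO with a larger one (at least @min_needed bytes and at
 * least twice the old size). The previous BO, if any, is kept on a list so it
 * stays alive until the command buffer is reset or destroyed.
 */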
561 static bool
562 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
563 {
564    uint64_t new_size;
565    struct radeon_winsys_bo *bo = NULL;
566    struct radv_cmd_buffer_upload *upload;
567    struct radv_device *device = cmd_buffer->device;
568 
569    new_size = MAX2(min_needed, 16 * 1024);
570    new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
571 
572    VkResult result =
573       device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
574                                 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
575                                    RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
576                                 RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
577 
578    if (result != VK_SUCCESS) {
579       cmd_buffer->record_result = result;
580       return false;
581    }
582 
583    radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
584    if (cmd_buffer->upload.upload_bo) {
585       upload = malloc(sizeof(*upload));
586 
587       if (!upload) {
588          cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
589          device->ws->buffer_destroy(device->ws, bo);
590          return false;
591       }
592 
593       memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
594       list_add(&upload->list, &cmd_buffer->upload.list);
595    }
596 
597    cmd_buffer->upload.upload_bo = bo;
598    cmd_buffer->upload.size = new_size;
599    cmd_buffer->upload.offset = 0;
600    cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
601 
602    if (!cmd_buffer->upload.map) {
603       cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
604       return false;
605    }
606 
607    return true;
608 }
609 
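/**
 * Sub-allocate @size bytes (a multiple of 4) from the upload BO, growing it
 * if necessary, and return the offset plus a CPU pointer into the mapping.
 */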
610 bool
611 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
612                              unsigned *out_offset, void **ptr)
613 {
614    assert(size % 4 == 0);
615 
616    struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
617 
618    /* Align to the scalar cache line size if it results in this allocation
619     * being placed in fewer of them.
620     */
621    unsigned offset = cmd_buffer->upload.offset;
622    unsigned line_size = rad_info->chip_class >= GFX10 ? 64 : 32;
623    unsigned gap = align(offset, line_size) - offset;
624    if ((size & (line_size - 1)) > gap)
625       offset = align(offset, line_size);
626 
627    if (offset + size > cmd_buffer->upload.size) {
628       if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
629          return false;
630       offset = 0;
631    }
632 
633    *out_offset = offset;
634    *ptr = cmd_buffer->upload.map + offset;
635 
636    cmd_buffer->upload.offset = offset + size;
637    return true;
638 }
639 
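/**
 * Copy @size bytes from @data into newly sub-allocated upload space and
 * return its offset within the upload BO.
 */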
640 bool
641 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
642                             unsigned *out_offset)
643 {
644    uint8_t *ptr;
645 
646    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
647       return false;
648 
649    if (ptr)
650       memcpy(ptr, data, size);
651 
652    return true;
653 }
654 
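/**
 * Debug aid used when the device owns a trace BO: bump the command buffer's
 * trace id, write it to the trace BO (secondary command buffers use a second
 * slot) and emit a NOP-encoded trace point into the CS.
 */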
655 void
656 radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
657 {
658    struct radv_device *device = cmd_buffer->device;
659    struct radeon_cmdbuf *cs = cmd_buffer->cs;
660    uint64_t va;
661 
662    va = radv_buffer_get_va(device->trace_bo);
663    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
664       va += 4;
665 
666    ++cmd_buffer->state.trace_id;
667    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);
668 
669    radeon_check_space(cmd_buffer->device->ws, cs, 2);
670 
671    radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
672    radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
673 }
674 
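/**
 * Per draw/dispatch epilogue: emit a thread-trace marker when SQTT is active,
 * force the graphics/compute engines idle for RADV_DEBUG_SYNC_SHADERS, and
 * write a trace point when the trace BO exists.
 */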
675 static void
676 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
677 {
678    if (unlikely(cmd_buffer->device->thread_trace.bo)) {
679       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
680       radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
681    }
682 
683    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
684       enum rgp_flush_bits sqtt_flush_bits = 0;
685       assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
686 
687       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
688 
689       /* Force wait for graphics or compute engines to be idle. */
690       si_cs_emit_cache_flush(cmd_buffer->cs,
691                              cmd_buffer->device->physical_device->rad_info.chip_class,
692                              &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
693                              radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
694                              cmd_buffer->gfx9_eop_bug_va);
695    }
696 
697    if (unlikely(cmd_buffer->device->trace_bo))
698       radv_cmd_buffer_trace_emit(cmd_buffer);
699 }
700 
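/**
 * With a trace BO present, record the bound pipeline's pointer at a fixed
 * offset (8 for the GFX ring, 16 for compute) so hang reports can identify
 * which pipeline was active.
 */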
701 static void
702 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
703 {
704    struct radv_device *device = cmd_buffer->device;
705    enum ring_type ring;
706    uint32_t data[2];
707    uint64_t va;
708 
709    va = radv_buffer_get_va(device->trace_bo);
710 
711    ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
712 
713    switch (ring) {
714    case RING_GFX:
715       va += 8;
716       break;
717    case RING_COMPUTE:
718       va += 16;
719       break;
720    default:
721       assert(!"invalid ring type");
722    }
723 
724    uint64_t pipeline_address = (uintptr_t)pipeline;
725    data[0] = pipeline_address;
726    data[1] = pipeline_address >> 32;
727 
728    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
729 }
730 
731 static void
732 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
733 {
734    struct radv_device *device = cmd_buffer->device;
735    uint32_t data[2];
736    uint64_t va;
737 
738    va = radv_buffer_get_va(device->trace_bo);
739    va += 24;
740 
741    data[0] = vb_ptr;
742    data[1] = vb_ptr >> 32;
743 
744    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
745 }
746 
747 void
748 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
749                         struct radv_descriptor_set *set, unsigned idx)
750 {
751    struct radv_descriptor_state *descriptors_state =
752       radv_get_descriptors_state(cmd_buffer, bind_point);
753 
754    descriptors_state->sets[idx] = set;
755 
756    descriptors_state->valid |= (1u << idx); /* active descriptors */
757    descriptors_state->dirty |= (1u << idx);
758 }
759 
760 static void
761 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
762 {
763    struct radv_descriptor_state *descriptors_state =
764       radv_get_descriptors_state(cmd_buffer, bind_point);
765    struct radv_device *device = cmd_buffer->device;
766    uint32_t data[MAX_SETS * 2] = {0};
767    uint64_t va;
768    va = radv_buffer_get_va(device->trace_bo) + 32;
769 
770    u_foreach_bit(i, descriptors_state->valid)
771    {
772       struct radv_descriptor_set *set = descriptors_state->sets[i];
773       data[i * 2] = (uint64_t)(uintptr_t)set;
774       data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
775    }
776 
777    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
778 }
779 
780 struct radv_userdata_info *
781 radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
782 {
783    struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
784    return &shader->info.user_sgprs_locs.shader_data[idx];
785 }
786 
787 static void
788 radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
789                            gl_shader_stage stage, int idx, uint64_t va)
790 {
791    struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
792    uint32_t base_reg = pipeline->user_data_0[stage];
793    if (loc->sgpr_idx == -1)
794       return;
795 
796    assert(loc->num_sgprs == 1);
797 
798    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
799                             false);
800 }
801 
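/**
 * Emit the user SGPRs holding descriptor set pointers for @stage, packing
 * consecutive dirty+valid sets into as few SET_SH_REG packets as possible.
 */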
802 static void
803 radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
804                               struct radv_descriptor_state *descriptors_state,
805                               gl_shader_stage stage)
806 {
807    struct radv_device *device = cmd_buffer->device;
808    struct radeon_cmdbuf *cs = cmd_buffer->cs;
809    uint32_t sh_base = pipeline->user_data_0[stage];
810    struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
811    unsigned mask = locs->descriptor_sets_enabled;
812 
813    mask &= descriptors_state->dirty & descriptors_state->valid;
814 
815    while (mask) {
816       int start, count;
817 
818       u_bit_scan_consecutive_range(&mask, &start, &count);
819 
820       struct radv_userdata_info *loc = &locs->descriptor_sets[start];
821       unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
822 
823       radv_emit_shader_pointer_head(cs, sh_offset, count, true);
824       for (int i = 0; i < count; i++) {
825          struct radv_descriptor_set *set = descriptors_state->sets[start + i];
826 
827          radv_emit_shader_pointer_body(device, cs, set->header.va, true);
828       }
829    }
830 }
831 
832 /**
833  * Convert the user sample locations to hardware sample locations (the values
834  * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
835  */
836 static void
837 radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
838                               VkOffset2D *sample_locs)
839 {
840    uint32_t x_offset = x % state->grid_size.width;
841    uint32_t y_offset = y % state->grid_size.height;
842    uint32_t num_samples = (uint32_t)state->per_pixel;
843    VkSampleLocationEXT *user_locs;
844    uint32_t pixel_offset;
845 
846    pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
847 
848    assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
849    user_locs = &state->locations[pixel_offset];
850 
851    for (uint32_t i = 0; i < num_samples; i++) {
852       float shifted_pos_x = user_locs[i].x - 0.5;
853       float shifted_pos_y = user_locs[i].y - 0.5;
854 
855       int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
856       int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
857 
858       sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
859       sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
860    }
861 }
862 
863 /**
864  * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
865  * locations.
866  */
867 static void
868 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
869                                uint32_t *sample_locs_pixel)
870 {
871    for (uint32_t i = 0; i < num_samples; i++) {
872       uint32_t sample_reg_idx = i / 4;
873       uint32_t sample_loc_idx = i % 4;
874       int32_t pos_x = sample_locs[i].x;
875       int32_t pos_y = sample_locs[i].y;
876 
877       uint32_t shift_x = 8 * sample_loc_idx;
878       uint32_t shift_y = shift_x + 4;
879 
880       sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
881       sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
882    }
883 }
884 
885 /**
886  * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
887  * sample locations.
888  */
889 static uint64_t
890 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
891                                uint32_t num_samples)
892 {
893    uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
894    uint32_t sample_mask = num_samples - 1;
895    uint32_t *distances = alloca(num_samples * sizeof(*distances));
896    uint64_t centroid_priority = 0;
897 
898    /* Compute the distances from center for each sample. */
899    for (int i = 0; i < num_samples; i++) {
900       distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
901    }
902 
903    /* Compute the centroid priorities by looking at the distances array. */
904    for (int i = 0; i < num_samples; i++) {
905       uint32_t min_idx = 0;
906 
907       for (int j = 1; j < num_samples; j++) {
908          if (distances[j] < distances[min_idx])
909             min_idx = j;
910       }
911 
912       centroid_priorities[i] = min_idx;
913       distances[min_idx] = 0xffffffff;
914    }
915 
916    /* Compute the final centroid priority. */
917    for (int i = 0; i < 8; i++) {
918       centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
919    }
920 
921    return centroid_priority << 32 | centroid_priority;
922 }
923 
924 /**
925  * Emit the sample locations that are specified with VK_EXT_sample_locations.
926  */
927 static void
928 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
929 {
930    struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
931    uint32_t num_samples = (uint32_t)sample_location->per_pixel;
932    struct radeon_cmdbuf *cs = cmd_buffer->cs;
933    uint32_t sample_locs_pixel[4][2] = {0};
934    VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
935    uint32_t max_sample_dist = 0;
936    uint64_t centroid_priority;
937 
938    if (!cmd_buffer->state.dynamic.sample_location.count)
939       return;
940 
941    /* Convert the user sample locations to hardware sample locations. */
942    radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
943    radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
944    radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
945    radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
946 
947    /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
948    for (uint32_t i = 0; i < 4; i++) {
949       radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
950    }
951 
952    /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
953    centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
954 
955    /* Compute the maximum sample distance from the specified locations. */
956    for (unsigned i = 0; i < 4; ++i) {
957       for (uint32_t j = 0; j < num_samples; j++) {
958          VkOffset2D offset = sample_locs[i][j];
959          max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
960       }
961    }
962 
963    /* Emit the specified user sample locations. */
964    switch (num_samples) {
965    case 2:
966    case 4:
967       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
968                              sample_locs_pixel[0][0]);
969       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
970                              sample_locs_pixel[1][0]);
971       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
972                              sample_locs_pixel[2][0]);
973       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
974                              sample_locs_pixel[3][0]);
975       break;
976    case 8:
977       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
978                              sample_locs_pixel[0][0]);
979       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
980                              sample_locs_pixel[1][0]);
981       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
982                              sample_locs_pixel[2][0]);
983       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
984                              sample_locs_pixel[3][0]);
985       radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
986                              sample_locs_pixel[0][1]);
987       radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
988                              sample_locs_pixel[1][1]);
989       radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
990                              sample_locs_pixel[2][1]);
991       radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
992                              sample_locs_pixel[3][1]);
993       break;
994    default:
995       unreachable("invalid number of samples");
996    }
997 
998    /* Emit the maximum sample distance and the centroid priority. */
999    radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
1000                               S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);
1001 
1002    radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1003    radeon_emit(cs, centroid_priority);
1004    radeon_emit(cs, centroid_priority >> 32);
1005 
1006    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1007 }
1008 
1009 static void
1010 radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
1011                              gl_shader_stage stage, int idx, uint32_t *values)
1012 {
1013    struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
1014    uint32_t base_reg = pipeline->user_data_0[stage];
1015    if (loc->sgpr_idx == -1)
1016       return;
1017 
1018    radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + loc->num_sgprs);
1019 
1020    radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1021    radeon_emit_array(cmd_buffer->cs, values, loc->num_sgprs);
1022 }
1023 
1024 static void
1025 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
1026 {
1027    int num_samples = pipeline->graphics.ms.num_samples;
1028    struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
1029 
1030    if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
1031       cmd_buffer->sample_positions_needed = true;
1032 
1033    if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
1034       return;
1035 
1036    radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
1037 
1038    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1039 }
1040 
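/**
 * Re-emit PA_SC_BINNER_CNTL_0 when the pipeline's binning settings changed.
 * Vega12/Vega20/Raven2 and GFX10+ also request a flush on binning mode
 * transitions via FLUSH_ON_BINNING_TRANSITION.
 */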
1041 static void
1042 radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
1043 {
1044    const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
1045 
1046    if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
1047       return;
1048 
1049    if (old_pipeline &&
1050        old_pipeline->graphics.binning.pa_sc_binner_cntl_0 ==
1051           pipeline->graphics.binning.pa_sc_binner_cntl_0)
1052       return;
1053 
1054    bool binning_flush = false;
1055    if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
1056        cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
1057        cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
1058        cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1059       binning_flush = !old_pipeline ||
1060                       G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
1061                          G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
1062    }
1063 
1064    radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
1065                           pipeline->graphics.binning.pa_sc_binner_cntl_0 |
1066                              S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
1067 
1068    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1069 }
1070 
1071 static void
1072 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader)
1073 {
1074    uint64_t va;
1075 
1076    if (!shader)
1077       return;
1078 
1079    va = radv_shader_variant_get_va(shader);
1080 
1081    si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1082 }
1083 
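/**
 * Prefetch shader binaries (and the vertex buffer descriptors) into L2 using
 * CP DMA. With @vertex_stage_only, only the VS and VBO descriptors are
 * prefetched so the first draw can start sooner; the remaining stages keep
 * their bits set in prefetch_L2_mask and are prefetched later.
 */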
1084 static void
1085 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
1086                       bool vertex_stage_only)
1087 {
1088    struct radv_cmd_state *state = &cmd_buffer->state;
1089    uint32_t mask = state->prefetch_L2_mask;
1090 
1091    if (vertex_stage_only) {
1092       /* Fast prefetch path for starting draws as soon as possible.
1093        */
1094       mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS);
1095    }
1096 
1097    if (mask & RADV_PREFETCH_VS)
1098       radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_VERTEX]);
1099 
1100    if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1101       si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);
1102 
1103    if (mask & RADV_PREFETCH_TCS)
1104       radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_CTRL]);
1105 
1106    if (mask & RADV_PREFETCH_TES)
1107       radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_EVAL]);
1108 
1109    if (mask & RADV_PREFETCH_GS) {
1110       radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_GEOMETRY]);
1111       if (radv_pipeline_has_gs_copy_shader(pipeline))
1112          radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
1113    }
1114 
1115    if (mask & RADV_PREFETCH_PS)
1116       radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_FRAGMENT]);
1117 
1118    state->prefetch_L2_mask &= ~mask;
1119 }
1120 
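/**
 * Program the RB+ registers (SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON,
 * SX_BLEND_OPT_CONTROL) from the pipeline's color export formats and the
 * current subpass attachments, skipping the emit when nothing changed.
 */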
1121 static void
1122 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1123 {
1124    if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
1125       return;
1126 
1127    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1128    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1129 
1130    unsigned sx_ps_downconvert = 0;
1131    unsigned sx_blend_opt_epsilon = 0;
1132    unsigned sx_blend_opt_control = 0;
1133 
1134    if (!cmd_buffer->state.attachments || !subpass)
1135       return;
1136 
1137    for (unsigned i = 0; i < subpass->color_count; ++i) {
1138       if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1139          /* We don't set the DISABLE bits, because the HW can't have holes,
1140           * so the SPI color format is set to 32-bit 1-component. */
1141          sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1142          continue;
1143       }
1144 
1145       int idx = subpass->color_attachments[i].attachment;
1146       struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1147 
1148       unsigned format = G_028C70_FORMAT(cb->cb_color_info);
1149       unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1150       uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
1151       uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
1152 
1153       bool has_alpha, has_rgb;
1154 
1155       /* Set if RGB and A are present. */
1156       has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
1157 
1158       if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1159          has_rgb = !has_alpha;
1160       else
1161          has_rgb = true;
1162 
1163       /* Check the colormask and export format. */
1164       if (!(colormask & 0x7))
1165          has_rgb = false;
1166       if (!(colormask & 0x8))
1167          has_alpha = false;
1168 
1169       if (spi_format == V_028714_SPI_SHADER_ZERO) {
1170          has_rgb = false;
1171          has_alpha = false;
1172       }
1173 
1174       /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1175        * optimization, even though it has no alpha. */
1176       if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1177          has_alpha = true;
1178 
1179       /* Disable value checking for disabled channels. */
1180       if (!has_rgb)
1181          sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1182       if (!has_alpha)
1183          sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1184 
1185       /* Enable down-conversion for 32bpp and smaller formats. */
1186       switch (format) {
1187       case V_028C70_COLOR_8:
1188       case V_028C70_COLOR_8_8:
1189       case V_028C70_COLOR_8_8_8_8:
1190          /* For 1 and 2-channel formats, use the superset thereof. */
1191          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1192              spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1193              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1194             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1195             sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1196          }
1197          break;
1198 
1199       case V_028C70_COLOR_5_6_5:
1200          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1201             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1202             sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1203          }
1204          break;
1205 
1206       case V_028C70_COLOR_1_5_5_5:
1207          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1208             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1209             sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1210          }
1211          break;
1212 
1213       case V_028C70_COLOR_4_4_4_4:
1214          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1215             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1216             sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1217          }
1218          break;
1219 
1220       case V_028C70_COLOR_32:
1221          if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1222             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1223          else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1224             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1225          break;
1226 
1227       case V_028C70_COLOR_16:
1228       case V_028C70_COLOR_16_16:
1229          /* For 1-channel formats, use the superset thereof. */
1230          if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1231              spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1232              spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1233              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1234             if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1235                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1236             else
1237                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1238          }
1239          break;
1240 
1241       case V_028C70_COLOR_10_11_11:
1242          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1243             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1244          break;
1245 
1246       case V_028C70_COLOR_2_10_10_10:
1247          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1248             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1249             sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1250          }
1251          break;
1252       case V_028C70_COLOR_5_9_9_9:
1253          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1254             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1255          break;
1256       }
1257    }
1258 
1259    /* Do not set the DISABLE bits for the unused attachments, as that
1260     * breaks dual source blending in SkQP and does not seem to improve
1261     * performance. */
1262 
1263    if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
1264        sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
1265        sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
1266       return;
1267 
1268    radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1269    radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1270    radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1271    radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1272 
1273    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1274 
1275    cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1276    cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1277    cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1278 }
1279 
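/**
 * With primitive binning enabled, emit a BREAK_BATCH event when the fragment
 * shader or the color write enable state changed and the binning settings
 * keep more than one context/persistent state per bin.
 */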
1280 static void
1281 radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1282 {
1283    if (!cmd_buffer->device->pbb_allowed)
1284       return;
1285 
1286    struct radv_binning_settings settings =
1287       radv_get_binning_settings(cmd_buffer->device->physical_device);
1288    bool break_for_new_ps =
1289       (!cmd_buffer->state.emitted_pipeline ||
1290        cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
1291           cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
1292       (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
1293    bool break_for_new_cb_target_mask =
1294       (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
1295       settings.context_states_per_bin > 1;
1296 
1297    if (!break_for_new_ps && !break_for_new_cb_target_mask)
1298       return;
1299 
1300    radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1301    radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1302 }
1303 
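/**
 * Emit the graphics pipeline's pre-built command streams and flag every
 * dynamic state whose derived register values differ from the previously
 * emitted pipeline so it gets re-emitted before the next draw.
 */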
1304 static void
1305 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1306 {
1307    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1308 
1309    if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
1310       return;
1311 
1312    radv_update_multisample_state(cmd_buffer, pipeline);
1313    radv_update_binning_state(cmd_buffer, pipeline);
1314 
1315    cmd_buffer->scratch_size_per_wave_needed =
1316       MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
1317    cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->max_waves);
1318 
1319    if (!cmd_buffer->state.emitted_pipeline ||
1320        cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
1321           pipeline->graphics.can_use_guardband)
1322       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1323 
1324    if (!cmd_buffer->state.emitted_pipeline ||
1325        cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl !=
1326           pipeline->graphics.pa_su_sc_mode_cntl)
1327       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
1328                                  RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
1329                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1330 
1331    if (!cmd_buffer->state.emitted_pipeline ||
1332        cmd_buffer->state.emitted_pipeline->graphics.pa_cl_clip_cntl !=
1333           pipeline->graphics.pa_cl_clip_cntl)
1334       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1335 
1336    if (!cmd_buffer->state.emitted_pipeline ||
1337        cmd_buffer->state.emitted_pipeline->graphics.cb_color_control !=
1338        pipeline->graphics.cb_color_control)
1339       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
1340 
1341    if (!cmd_buffer->state.emitted_pipeline)
1342       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
1343                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
1344                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
1345                                  RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1346 
1347    if (!cmd_buffer->state.emitted_pipeline ||
1348        cmd_buffer->state.emitted_pipeline->graphics.db_depth_control !=
1349           pipeline->graphics.db_depth_control)
1350       cmd_buffer->state.dirty |=
1351          RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
1352          RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
1353          RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1354 
1355    if (!cmd_buffer->state.emitted_pipeline)
1356       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1357 
1358    if (!cmd_buffer->state.emitted_pipeline ||
1359        cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
1360        pipeline->graphics.cb_target_mask) {
1361       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
1362    }
1363 
1364    radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
1365 
1366    if (pipeline->graphics.has_ngg_culling &&
1367        pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
1368        !cmd_buffer->state.last_nggc_settings) {
1369       /* The already emitted RSRC2 contains the LDS required for NGG culling.
1370        * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
1371        * API GS always needs LDS, so this isn't useful there.
1372        */
1373       struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
1374       radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1375                         (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
1376                         S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
1377    }
1378 
1379    if (!cmd_buffer->state.emitted_pipeline ||
1380        cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
1381        cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
1382        memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, pipeline->ctx_cs.buf,
1383               pipeline->ctx_cs.cdw * 4)) {
1384       radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
1385       cmd_buffer->state.context_roll_without_scissor_emitted = true;
1386    }
1387 
1388    radv_emit_batch_break_on_new_ps(cmd_buffer);
1389 
1390    for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
1391       if (!pipeline->shaders[i])
1392          continue;
1393 
1394       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
1395    }
1396 
1397    if (radv_pipeline_has_gs_copy_shader(pipeline))
1398       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);
1399 
1400    if (unlikely(cmd_buffer->device->trace_bo))
1401       radv_save_pipeline(cmd_buffer, pipeline);
1402 
1403    cmd_buffer->state.emitted_pipeline = pipeline;
1404 
1405    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1406 }
1407 
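/**
 * Emit PA_CL_VPORT_XSCALE..ZOFFSET (six dwords per viewport) followed by
 * the per-viewport PA_SC_VPORT_ZMIN/ZMAX pairs. xform[] is assumed to hold
 * the standard Vulkan viewport transform precomputed when the viewports
 * were set, roughly (illustrative sketch only):
 *
 *    scale[0] = 0.5f * width;    translate[0] = x + 0.5f * width;
 *    scale[1] = 0.5f * height;   translate[1] = y + 0.5f * height;
 *    scale[2] = maxDepth - minDepth;  translate[2] = minDepth;
 */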
1408 static void
1409 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1410 {
1411    const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
1412    int i;
1413    const unsigned count = viewport->count;
1414 
1415    assert(count);
1416    radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);
1417 
1418    for (i = 0; i < count; i++) {
1419       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
1420       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
1421       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
1422       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
1423       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[2]));
1424       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[2]));
1425    }
1426 
1427    radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
1428    for (i = 0; i < count; i++) {
1429       float zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1430       float zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1431       radeon_emit(cmd_buffer->cs, fui(zmin));
1432       radeon_emit(cmd_buffer->cs, fui(zmax));
1433    }
1434 }
1435 
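/**
 * Emit the scissor rectangles. The viewports are passed along so
 * si_write_scissors() can compute the guardband; emitting scissors also
 * clears the "context roll without scissor" tracking flag.
 */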
1436 static void
1437 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1438 {
1439    uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1440 
1441    si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
1442                      cmd_buffer->state.dynamic.viewport.viewports,
1443                      cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1444 
1445    cmd_buffer->state.context_roll_without_scissor_emitted = false;
1446 }
1447 
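/**
 * Emit the VK_EXT_discard_rectangles state as PA_SC_CLIPRECT_n TL/BR pairs.
 */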
1448 static void
1449 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1450 {
1451    if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1452       return;
1453 
1454    radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1455                               cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1456    for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1457       VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1458       radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1459       radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1460                                      S_028214_BR_Y(rect.offset.y + rect.extent.height));
1461    }
1462 }
1463 
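/**
 * Emit the dynamic line width, converted to the hardware fixed-point
 * encoding by multiplying by 8 and clamping.
 */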
1464 static void
1465 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1466 {
1467    unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1468 
1469    radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1470                           S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1471 }
1472 
1473 static void
1474 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1475 {
1476    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1477 
1478    radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1479    radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1480 }
1481 
1482 static void
1483 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1484 {
1485    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1486 
1487    radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1488    radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1489                                   S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1490                                   S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1491                                   S_028430_STENCILOPVAL(1));
1492    radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1493                                   S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1494                                   S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1495                                   S_028434_STENCILOPVAL_BF(1));
1496 }
1497 
1498 static void
1499 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1500 {
1501    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1502 
1503    radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
1504    radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
1505    radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
1506 }
1507 
1508 static void
1509 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1510 {
1511    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1512    unsigned slope = fui(d->depth_bias.slope * 16.0f);
1513 
1514    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1515    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1516    radeon_emit(cmd_buffer->cs, slope);                    /* FRONT SCALE */
1517    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* FRONT OFFSET */
1518    radeon_emit(cmd_buffer->cs, slope);                    /* BACK SCALE */
1519    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* BACK OFFSET */
1520 }
1521 
1522 static void
1523 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1524 {
1525    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1526    uint32_t auto_reset_cntl = 1;
1527 
1528    if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1529       auto_reset_cntl = 2;
1530 
1531    radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1532                           S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1533                              S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1534                              S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1535 }
1536 
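/**
 * Rebuild PA_SU_SC_MODE_CNTL from the pipeline's static value: the C_*
 * masks clear the cull/front-face/poly-offset fields and the S_* macros
 * re-pack them from the current dynamic state (cull mode, front face,
 * depth bias enable).
 */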
1537 static void
1538 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1539 {
1540    unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
1541    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1542 
1543    pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
1544                          C_028814_CULL_BACK &
1545                          C_028814_FACE &
1546                          C_028814_POLY_OFFSET_FRONT_ENABLE &
1547                          C_028814_POLY_OFFSET_BACK_ENABLE &
1548                          C_028814_POLY_OFFSET_PARA_ENABLE;
1549 
1550    pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1551                          S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1552                          S_028814_FACE(d->front_face) |
1553                          S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1554                          S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1555                          S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1556 
1557    radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1558 }
1559 
1560 static void
1561 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1562 {
1563    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1564 
1565    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
1566       radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1567                                  R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1568    } else {
1569       radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1570    }
1571 }
1572 
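/**
 * Same pattern as radv_emit_culling(): take the pipeline's static
 * DB_DEPTH_CONTROL, clear the dynamically controlled fields and re-pack
 * them from the current depth/stencil dynamic state.
 */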
1573 static void
1574 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1575 {
1576    unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control;
1577    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1578 
1579    db_depth_control &= C_028800_Z_ENABLE &
1580                        C_028800_Z_WRITE_ENABLE &
1581                        C_028800_ZFUNC &
1582                        C_028800_DEPTH_BOUNDS_ENABLE &
1583                        C_028800_STENCIL_ENABLE &
1584                        C_028800_BACKFACE_ENABLE &
1585                        C_028800_STENCILFUNC &
1586                        C_028800_STENCILFUNC_BF;
1587 
1588    db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
1589                        S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
1590                        S_028800_ZFUNC(d->depth_compare_op) |
1591                        S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
1592                        S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
1593                        S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1594                        S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
1595                        S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
1596 
1597    radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
1598 }
1599 
1600 static void
1601 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1602 {
1603    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1604 
1605    radeon_set_context_reg(
1606       cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1607       S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1608          S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1609          S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1610          S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1611          S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1612          S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1613 }
1614 
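/**
 * Emit the fragment shading rate state: the per-draw rate goes into
 * GE_VRS_RATE (the first combiner input) and the vertex/HTILE combiner
 * modes go into PA_CL_VRS_CNTL. The per-draw X/Y rates are clamped to two
 * pixels per dimension and encoded as (size - 1).
 */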
1615 static void
1616 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
1617 {
1618    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1619    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1620    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1621    uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
1622    uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
1623    uint32_t pa_cl_vrs_cntl = pipeline->graphics.vrs.pa_cl_vrs_cntl;
1624    uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0];
1625    uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
1626 
1627    if (subpass && !subpass->vrs_attachment) {
1628       /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
1629        * can cheat by tweaking the different combiner modes.
1630        */
1631       switch (htile_comb_mode) {
1632       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
1633          /* The result of min(A, 1x1) is always 1x1. */
1634          FALLTHROUGH;
1635       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
1636          /* Force the per-draw VRS rate to 1x1. */
1637          rate_x = rate_y = 0;
1638 
1639          /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
1640           * combiner mode as passthrough.
1641           */
1642          vertex_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
1643          break;
1644       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
1645          /* The result of max(A, 1x1) is always A. */
1646          FALLTHROUGH;
1647       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
1648          /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
1649          break;
1650       default:
1651          break;
1652       }
1653    }
1654 
1655    /* Emit per-draw VRS rate which is the first combiner. */
1656    radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
1657                           S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
1658 
1659    /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
1660     * draw rate and the vertex rate.
1661     */
1662    pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(vertex_comb_mode);
1663 
1664    /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
1665     * rate.
1666     */
1667    pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
1668 
1669    radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
1670 }
1671 
1672 static void
1673 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
1674 {
1675    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1676 
1677    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1678       radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
1679                              d->primitive_restart_enable);
1680    } else {
1681       radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
1682                              d->primitive_restart_enable);
1683    }
1684 }
1685 
1686 static void
1687 radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1688 {
1689    unsigned pa_cl_clip_cntl = cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl;
1690    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1691 
1692    pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
1693    pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1694 
1695    radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1696 }
1697 
1698 static void
1699 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1700 {
1701    unsigned cb_color_control = cmd_buffer->state.pipeline->graphics.cb_color_control;
1702    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1703 
1704    cb_color_control &= C_028808_ROP3;
1705    cb_color_control |= S_028808_ROP3(d->logic_op);
1706 
1707    radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1708 }
1709 
1710 static void
1711 radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1712 {
1713    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1714    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1715 
1716    radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1717                           pipeline->graphics.cb_target_mask & d->color_write_enable);
1718 }
1719 
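/**
 * Emit the CB_COLOR* registers for one color attachment. The register
 * layout differs per generation (GFX10+, GFX9, GFX6-8). DCC and FMASK
 * compression bits are dropped from CB_COLOR_INFO when the current layout
 * (or disable_dcc) does not allow compressed access, and drawing with DCC
 * enabled marks the image's DCC metadata as compressed.
 */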
1720 static void
1721 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
1722                          struct radv_color_buffer_info *cb, struct radv_image_view *iview,
1723                          VkImageLayout layout, bool in_render_loop, bool disable_dcc)
1724 {
1725    bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1726    uint32_t cb_color_info = cb->cb_color_info;
1727    struct radv_image *image = iview->image;
1728 
1729    if (!radv_layout_dcc_compressed(
1730           cmd_buffer->device, image, iview->base_mip, layout, in_render_loop,
1731           radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1732                                        cmd_buffer->queue_family_index)) ||
1733        disable_dcc) {
1734       cb_color_info &= C_028C70_DCC_ENABLE;
1735    }
1736 
1737    if (!radv_layout_fmask_compressed(
1738           cmd_buffer->device, image, layout,
1739           radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1740                                        cmd_buffer->queue_family_index))) {
1741       cb_color_info &= C_028C70_COMPRESSION;
1742    }
1743 
1744    if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1745                                                 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1746       /* If this bit is set, the FMASK decompression operation
1747        * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1748        */
1749       cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1750    }
1751 
1752    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1753       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1754       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1755       radeon_emit(cmd_buffer->cs, 0);
1756       radeon_emit(cmd_buffer->cs, 0);
1757       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1758       radeon_emit(cmd_buffer->cs, cb_color_info);
1759       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1760       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1761       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1762       radeon_emit(cmd_buffer->cs, 0);
1763       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1764       radeon_emit(cmd_buffer->cs, 0);
1765 
1766       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1767 
1768       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1769                              cb->cb_color_base >> 32);
1770       radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1771                              cb->cb_color_cmask >> 32);
1772       radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1773                              cb->cb_color_fmask >> 32);
1774       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1775                              cb->cb_dcc_base >> 32);
1776       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1777                              cb->cb_color_attrib2);
1778       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1779                              cb->cb_color_attrib3);
1780    } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1781       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1782       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1783       radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1784       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1785       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1786       radeon_emit(cmd_buffer->cs, cb_color_info);
1787       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1788       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1789       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1790       radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1791       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1792       radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1793 
1794       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1795       radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1796       radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1797 
1798       radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1799                              cb->cb_mrt_epitch);
1800    } else {
1801       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1802       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1803       radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1804       radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1805       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1806       radeon_emit(cmd_buffer->cs, cb_color_info);
1807       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1808       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1809       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1810       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1811       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1812       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1813 
1814       if (is_vi) { /* DCC BASE */
1815          radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
1816                                 cb->cb_dcc_base);
1817       }
1818    }
1819 
1820    if (G_028C70_DCC_ENABLE(cb_color_info)) {
1821       /* Drawing with DCC enabled also compresses colorbuffers. */
1822       VkImageSubresourceRange range = {
1823          .aspectMask = iview->aspect_mask,
1824          .baseMipLevel = iview->base_mip,
1825          .levelCount = iview->level_count,
1826          .baseArrayLayer = iview->base_layer,
1827          .layerCount = iview->layer_count,
1828       };
1829 
1830       radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1831    }
1832 }
1833 
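/**
 * Work around the TC-compat zrange bug by re-emitting DB_Z_INFO with
 * ZRANGE_PRECISION cleared. When the last fast clear value isn't known at
 * record time, the write is guarded by a COND_EXEC packet that reads the
 * per-mip TC-compat zrange metadata.
 */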
1834 static void
1835 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1836                              const struct radv_image_view *iview, VkImageLayout layout,
1837                              bool in_render_loop, bool requires_cond_exec)
1838 {
1839    const struct radv_image *image = iview->image;
1840    uint32_t db_z_info = ds->db_z_info;
1841    uint32_t db_z_info_reg;
1842 
1843    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1844        !radv_image_is_tc_compat_htile(image))
1845       return;
1846 
1847    if (!radv_layout_is_htile_compressed(
1848           cmd_buffer->device, image, layout, in_render_loop,
1849           radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1850                                        cmd_buffer->queue_family_index))) {
1851       db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1852    }
1853 
1854    db_z_info &= C_028040_ZRANGE_PRECISION;
1855 
1856    if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1857       db_z_info_reg = R_028038_DB_Z_INFO;
1858    } else {
1859       db_z_info_reg = R_028040_DB_Z_INFO;
1860    }
1861 
1862    /* When we don't know the last fast clear value, we need to emit a
1863     * conditional packet that will eventually skip the following
1864     * SET_CONTEXT_REG packet.
1865     */
1866    if (requires_cond_exec) {
1867       uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
1868 
1869       radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1870       radeon_emit(cmd_buffer->cs, va);
1871       radeon_emit(cmd_buffer->cs, va >> 32);
1872       radeon_emit(cmd_buffer->cs, 0);
1873       radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1874    }
1875 
1876    radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1877 }
1878 
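/**
 * Emit the DB depth/stencil surface registers for the bound attachment.
 * When the layout does not allow HTILE-compressed access, HTILE is
 * disabled in DB_Z_INFO/DB_STENCIL_INFO. The register layout differs per
 * generation (GFX10+, GFX9, GFX6-8).
 */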
1879 static void
1880 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1881                       struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
1882 {
1883    const struct radv_image *image = iview->image;
1884    uint32_t db_z_info = ds->db_z_info;
1885    uint32_t db_stencil_info = ds->db_stencil_info;
1886 
1887    if (!radv_layout_is_htile_compressed(
1888           cmd_buffer->device, image, layout, in_render_loop,
1889           radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1890                                        cmd_buffer->queue_family_index))) {
1891       db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1892       db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1893    }
1894 
1895    radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1896    radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1897 
1898    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1899       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1900       radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
1901 
1902       radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
1903       radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
1904       radeon_emit(cmd_buffer->cs, db_z_info);
1905       radeon_emit(cmd_buffer->cs, db_stencil_info);
1906       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1907       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1908       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1909       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1910 
1911       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
1912       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1913       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1914       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1915       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1916       radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
1917    } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1918       radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1919       radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1920       radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1921       radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1922 
1923       radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1924       radeon_emit(cmd_buffer->cs, db_z_info);          /* DB_Z_INFO */
1925       radeon_emit(cmd_buffer->cs, db_stencil_info);    /* DB_STENCIL_INFO */
1926       radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
1927       radeon_emit(cmd_buffer->cs,
1928                   S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
1929       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);   /* DB_STENCIL_READ_BASE */
1930       radeon_emit(cmd_buffer->cs,
1931                   S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1932       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);              /* DB_Z_WRITE_BASE */
1933       radeon_emit(cmd_buffer->cs,
1934                   S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
1935       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);   /* DB_STENCIL_WRITE_BASE */
1936       radeon_emit(cmd_buffer->cs,
1937                   S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1938 
1939       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1940       radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1941       radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1942    } else {
1943       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1944 
1945       radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1946       radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
1947       radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
1948       radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
1949       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
1950       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
1951       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
1952       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
1953       radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
1954       radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
1955    }
1956 
1957    /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1958    radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);
1959 
1960    radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1961                           ds->pa_su_poly_offset_db_fmt_cntl);
1962 }
1963 
1964 /**
1965  * Update the fast clear depth/stencil values if the image is bound as a
1966  * depth/stencil buffer.
1967  */
1968 static void
1969 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1970                                 const struct radv_image_view *iview,
1971                                 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
1972 {
1973    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1974    const struct radv_image *image = iview->image;
1975    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1976    uint32_t att_idx;
1977 
1978    if (!cmd_buffer->state.attachments || !subpass)
1979       return;
1980 
1981    if (!subpass->depth_stencil_attachment)
1982       return;
1983 
1984    att_idx = subpass->depth_stencil_attachment->attachment;
1985    if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1986       return;
1987 
1988    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
1989       radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1990       radeon_emit(cs, ds_clear_value.stencil);
1991       radeon_emit(cs, fui(ds_clear_value.depth));
1992    } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1993       radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
1994    } else {
1995       assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1996       radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
1997    }
1998 
1999    /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
2000     * only needed when clearing Z to 0.0.
2001     */
2002    if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
2003       VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2004       bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2005 
2006       radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
2007                                    layout, in_render_loop, false);
2008    }
2009 
2010    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2011 }
2012 
2013 /**
2014  * Set the clear depth/stencil values to the image's metadata.
2015  */
2016 static void
2017 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2018                            const VkImageSubresourceRange *range,
2019                            VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2020 {
2021    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2022    uint32_t level_count = radv_get_levelCount(image, range);
2023 
2024    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2025       uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
2026 
2027       /* Use the fastest way when both aspects are used. */
2028       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
2029       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2030       radeon_emit(cs, va);
2031       radeon_emit(cs, va >> 32);
2032 
2033       for (uint32_t l = 0; l < level_count; l++) {
2034          radeon_emit(cs, ds_clear_value.stencil);
2035          radeon_emit(cs, fui(ds_clear_value.depth));
2036       }
2037    } else {
2038       /* Otherwise we need one WRITE_DATA packet per level. */
2039       for (uint32_t l = 0; l < level_count; l++) {
2040          uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
2041          unsigned value;
2042 
2043          if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2044             value = fui(ds_clear_value.depth);
2045             va += 4;
2046          } else {
2047             assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2048             value = ds_clear_value.stencil;
2049          }
2050 
2051          radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
2052          radeon_emit(cs,
2053                      S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2054          radeon_emit(cs, va);
2055          radeon_emit(cs, va >> 32);
2056          radeon_emit(cs, value);
2057       }
2058    }
2059 }
2060 
2061 /**
2062  * Update the TC-compat metadata value for this image.
2063  */
2064 static void
2065 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2066                                    const VkImageSubresourceRange *range, uint32_t value)
2067 {
2068    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2069 
2070    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2071       return;
2072 
2073    uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2074    uint32_t level_count = radv_get_levelCount(image, range);
2075 
2076    radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2077    radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2078    radeon_emit(cs, va);
2079    radeon_emit(cs, va >> 32);
2080 
2081    for (uint32_t l = 0; l < level_count; l++)
2082       radeon_emit(cs, value);
2083 }
2084 
2085 static void
2086 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2087                                       const struct radv_image_view *iview,
2088                                       VkClearDepthStencilValue ds_clear_value)
2089 {
2090    VkImageSubresourceRange range = {
2091       .aspectMask = iview->aspect_mask,
2092       .baseMipLevel = iview->base_mip,
2093       .levelCount = iview->level_count,
2094       .baseArrayLayer = iview->base_layer,
2095       .layerCount = iview->layer_count,
2096    };
2097    uint32_t cond_val;
2098 
2099    /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2100     * depth clear value is 0.0f.
2101     */
2102    cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2103 
2104    radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2105 }
2106 
2107 /**
2108  * Update the clear depth/stencil values for this image.
2109  */
2110 void
2111 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2112                               const struct radv_image_view *iview,
2113                               VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2114 {
2115    VkImageSubresourceRange range = {
2116       .aspectMask = iview->aspect_mask,
2117       .baseMipLevel = iview->base_mip,
2118       .levelCount = iview->level_count,
2119       .baseArrayLayer = iview->base_layer,
2120       .layerCount = iview->layer_count,
2121    };
2122    struct radv_image *image = iview->image;
2123 
2124    assert(radv_htile_enabled(image, range.baseMipLevel));
2125 
2126    radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2127 
2128    if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2129       radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2130    }
2131 
2132    radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2133 }
2134 
2135 /**
2136  * Load the clear depth/stencil values from the image's metadata.
2137  */
2138 static void
2139 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2140 {
2141    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2142    const struct radv_image *image = iview->image;
2143    VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
2144    uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
2145    unsigned reg_offset = 0, reg_count = 0;
2146 
2147    assert(radv_image_has_htile(image));
2148 
2149    if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2150       ++reg_count;
2151    } else {
2152       ++reg_offset;
2153       va += 4;
2154    }
2155    if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2156       ++reg_count;
2157 
2158    uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2159 
2160    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2161       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2162       radeon_emit(cs, va);
2163       radeon_emit(cs, va >> 32);
2164       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2165       radeon_emit(cs, reg_count);
2166    } else {
2167       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2168       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2169                          (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2170       radeon_emit(cs, va);
2171       radeon_emit(cs, va >> 32);
2172       radeon_emit(cs, reg >> 2);
2173       radeon_emit(cs, 0);
2174 
2175       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2176       radeon_emit(cs, 0);
2177    }
2178 }
2179 
2180 /*
2181  * With DCC, some colors don't require CMASK elimination before being
2182  * used as a texture. This sets a predicate value to determine whether the
2183  * CMASK eliminate is required.
2184  */
2185 void
2186 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2187                          const VkImageSubresourceRange *range, bool value)
2188 {
2189    if (!image->fce_pred_offset)
2190       return;
2191 
2192    uint64_t pred_val = value;
2193    uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2194    uint32_t level_count = radv_get_levelCount(image, range);
2195    uint32_t count = 2 * level_count;
2196 
2197    radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2198    radeon_emit(cmd_buffer->cs,
2199                S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2200    radeon_emit(cmd_buffer->cs, va);
2201    radeon_emit(cmd_buffer->cs, va >> 32);
2202 
2203    for (uint32_t l = 0; l < level_count; l++) {
2204       radeon_emit(cmd_buffer->cs, pred_val);
2205       radeon_emit(cmd_buffer->cs, pred_val >> 32);
2206    }
2207 }
2208 
2209 /**
2210  * Update the DCC predicate to reflect the compression state.
2211  */
2212 void
2213 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2214                          const VkImageSubresourceRange *range, bool value)
2215 {
2216    if (image->dcc_pred_offset == 0)
2217       return;
2218 
2219    uint64_t pred_val = value;
2220    uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2221    uint32_t level_count = radv_get_levelCount(image, range);
2222    uint32_t count = 2 * level_count;
2223 
2224    assert(radv_dcc_enabled(image, range->baseMipLevel));
2225 
2226    radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2227    radeon_emit(cmd_buffer->cs,
2228                S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2229    radeon_emit(cmd_buffer->cs, va);
2230    radeon_emit(cmd_buffer->cs, va >> 32);
2231 
2232    for (uint32_t l = 0; l < level_count; l++) {
2233       radeon_emit(cmd_buffer->cs, pred_val);
2234       radeon_emit(cmd_buffer->cs, pred_val >> 32);
2235    }
2236 }
2237 
2238 /**
2239  * Update the fast clear color values if the image is bound as a color buffer.
2240  */
2241 static void
2242 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2243                                    int cb_idx, uint32_t color_values[2])
2244 {
2245    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2246    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2247    uint32_t att_idx;
2248 
2249    if (!cmd_buffer->state.attachments || !subpass)
2250       return;
2251 
2252    att_idx = subpass->color_attachments[cb_idx].attachment;
2253    if (att_idx == VK_ATTACHMENT_UNUSED)
2254       return;
2255 
2256    if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2257       return;
2258 
2259    radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2260    radeon_emit(cs, color_values[0]);
2261    radeon_emit(cs, color_values[1]);
2262 
2263    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2264 }
2265 
2266 /**
2267  * Set the clear color values to the image's metadata.
2268  */
2269 static void
2270 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2271                               const VkImageSubresourceRange *range, uint32_t color_values[2])
2272 {
2273    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2274    uint32_t level_count = radv_get_levelCount(image, range);
2275    uint32_t count = 2 * level_count;
2276 
2277    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
2278 
2279    if (radv_image_has_clear_value(image)) {
2280       uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2281 
2282       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2283       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2284       radeon_emit(cs, va);
2285       radeon_emit(cs, va >> 32);
2286 
2287       for (uint32_t l = 0; l < level_count; l++) {
2288          radeon_emit(cs, color_values[0]);
2289          radeon_emit(cs, color_values[1]);
2290       }
2291    } else {
2292       /* No clear value metadata; only the default (zero) clear values are expected here. */
2293       assert(color_values[0] == 0 && color_values[1] == 0);
2294    }
2295 }
2296 
2297 /**
2298  * Update the clear color values for this image.
2299  */
2300 void
2301 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2302                                  const struct radv_image_view *iview, int cb_idx,
2303                                  uint32_t color_values[2])
2304 {
2305    struct radv_image *image = iview->image;
2306    VkImageSubresourceRange range = {
2307       .aspectMask = iview->aspect_mask,
2308       .baseMipLevel = iview->base_mip,
2309       .levelCount = iview->level_count,
2310       .baseArrayLayer = iview->base_layer,
2311       .layerCount = iview->layer_count,
2312    };
2313 
2314    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip));
2315 
2316    /* There is no need to update the clear value for images that are fast cleared with the
2317     * comp-to-single mode because the hardware gets the value from the image directly.
2318     */
2319    if (iview->image->support_comp_to_single)
2320       return;
2321 
2322    radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2323 
2324    radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2325 }
2326 
2327 /**
2328  * Load the clear color values from the image's metadata.
2329  */
2330 static void
2331 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
2332                                int cb_idx)
2333 {
2334    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2335    struct radv_image *image = iview->image;
2336 
2337    if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip))
2338       return;
2339 
2340    if (iview->image->support_comp_to_single)
2341       return;
2342 
2343    if (!radv_image_has_clear_value(image)) {
2344       uint32_t color_values[2] = {0, 0};
2345       radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2346       return;
2347    }
2348 
2349    uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
2350    uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2351 
2352    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2353       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2354       radeon_emit(cs, va);
2355       radeon_emit(cs, va >> 32);
2356       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2357       radeon_emit(cs, 2);
2358    } else {
2359       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2360       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2361                          COPY_DATA_COUNT_SEL);
2362       radeon_emit(cs, va);
2363       radeon_emit(cs, va >> 32);
2364       radeon_emit(cs, reg >> 2);
2365       radeon_emit(cs, 0);
2366 
2367       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2368       radeon_emit(cs, 0);
2369    }
2370 }
2371 
2372 /* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
2373  * broken if the CB caches data from multiple mips of the same image at the
2374  * same time.
2375  *
2376  * Insert some flushes to avoid this.
2377  */
2378 static void
2379 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
2380 {
2381    struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2382    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2383    bool color_mip_changed = false;
2384 
2385    /* Entire workaround is not applicable before GFX9 */
2386    if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2387       return;
2388 
2389    if (!framebuffer)
2390       return;
2391 
2392    for (int i = 0; i < subpass->color_count; ++i) {
2393       int idx = subpass->color_attachments[i].attachment;
2394       if (idx == VK_ATTACHMENT_UNUSED)
2395          continue;
2396 
2397       struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2398 
2399       if ((radv_image_has_CB_metadata(iview->image) ||
2400            radv_dcc_enabled(iview->image, iview->base_mip) ||
2401            radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
2402           cmd_buffer->state.cb_mip[i] != iview->base_mip)
2403          color_mip_changed = true;
2404 
2405       cmd_buffer->state.cb_mip[i] = iview->base_mip;
2406    }
2407 
2408    if (color_mip_changed) {
2409       cmd_buffer->state.flush_bits |=
2410          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2411    }
2412 }
2413 
2414 /* This function does the flushes for mip changes if the levels are not zero for
2415  * all render targets. This way we can assume at the start of the next cmd_buffer
2416  * that rendering to mip 0 doesn't need any flushes. Since that is the most common
2417  * case, this saves some flushes. */
2418 static void
2419 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2420 {
2421    /* Entire workaround is not applicable before GFX9 */
2422    if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2423       return;
2424 
2425    bool need_color_mip_flush = false;
2426    for (unsigned i = 0; i < 8; ++i) {
2427       if (cmd_buffer->state.cb_mip[i]) {
2428          need_color_mip_flush = true;
2429          break;
2430       }
2431    }
2432 
2433    if (need_color_mip_flush) {
2434       cmd_buffer->state.flush_bits |=
2435          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2436    }
2437 
2438    memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2439 }
2440 
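/**
 * Return the device-global VRS image, initializing the VRS state on first
 * use; on failure the error is recorded in the command buffer and NULL is
 * returned.
 */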
2441 static struct radv_image *
2442 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2443 {
2444    struct radv_device *device = cmd_buffer->device;
2445 
2446    if (!device->vrs.image) {
2447       VkResult result;
2448 
2449       /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2450       result = radv_device_init_vrs_state(device);
2451       if (result != VK_SUCCESS) {
2452          cmd_buffer->record_result = result;
2453          return NULL;
2454       }
2455    }
2456 
2457    return device->vrs.image;
2458 }
2459 
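/**
 * Emit the framebuffer state for the current subpass: per-attachment CB
 * state (unused slots are marked COLOR_INVALID), the depth/stencil state
 * (loading the fast clear values when HTILE-compressed rendering is
 * enabled), or, for a subpass with only a VRS attachment, the internal
 * HTILE-backed depth buffer that carries the VRS data.
 */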
2460 static void
2461 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
2462 {
2463    int i;
2464    struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2465    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2466 
2467    /* This may happen when recording an inherited secondary command buffer. */
2468    if (!framebuffer)
2469       return;
2470 
2471    for (i = 0; i < 8; ++i) {
2472       if (i >= subpass->color_count ||
2473           subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
2474          radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
2475                                 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
2476          continue;
2477       }
2478 
2479       int idx = subpass->color_attachments[i].attachment;
2480       struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2481       VkImageLayout layout = subpass->color_attachments[i].layout;
2482       bool in_render_loop = subpass->color_attachments[i].in_render_loop;
2483 
2484       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bo);
2485 
2486       assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2487                                    VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2488       radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
2489                                in_render_loop, cmd_buffer->state.attachments[idx].disable_dcc);
2490 
2491       radv_load_color_clear_metadata(cmd_buffer, iview, i);
2492    }
2493 
2494    if (subpass->depth_stencil_attachment) {
2495       int idx = subpass->depth_stencil_attachment->attachment;
2496       VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2497       bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2498       struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2499       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2500                          cmd_buffer->state.attachments[idx].iview->image->bo);
2501 
2502       radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
2503                             in_render_loop);
2504 
2505       if (radv_layout_is_htile_compressed(
2506              cmd_buffer->device, iview->image, layout, in_render_loop,
2507              radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index,
2508                                           cmd_buffer->queue_family_index))) {
2509          /* Only load the depth/stencil fast clear values when
2510           * compressed rendering is enabled.
2511           */
2512          radv_load_ds_clear_metadata(cmd_buffer, iview);
2513       }
2514    } else if (subpass->vrs_attachment && cmd_buffer->device->vrs.image) {
2515       /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
2516        * bind our internal depth buffer that contains the VRS data as part of HTILE.
2517        */
2518       VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
2519       struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
2520       struct radv_image *image = cmd_buffer->device->vrs.image;
2521       struct radv_ds_buffer_info ds;
2522       struct radv_image_view iview;
2523 
2524       radv_image_view_init(&iview, cmd_buffer->device,
2525                            &(VkImageViewCreateInfo){
2526                               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2527                               .image = radv_image_to_handle(image),
2528                               .viewType = radv_meta_get_view_type(image),
2529                               .format = image->vk_format,
2530                               .subresourceRange =
2531                                  {
2532                                     .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
2533                                     .baseMipLevel = 0,
2534                                     .levelCount = 1,
2535                                     .baseArrayLayer = 0,
2536                                     .layerCount = 1,
2537                                  },
2538                            },
2539                            NULL);
2540 
2541       radv_initialise_vrs_surface(image, htile_buffer, &ds);
2542 
2543       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);
2544 
2545       radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);
2546 
2547       radv_image_view_finish(&iview);
2548    } else {
2549       if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
2550          radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2551       else
2552          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2553 
2554       radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID));       /* DB_Z_INFO */
2555       radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2556    }
2557    radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2558                           S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));
2559 
2560    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
2561       bool disable_constant_encode =
2562          cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2563       enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
2564       uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
2565 
2566       radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2567                              S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
2568                                 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2569                                 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2570    }
2571 
2572    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2573 }
2574 
2575 static void
2576 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2577 {
2578    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2579    struct radv_cmd_state *state = &cmd_buffer->state;
2580 
2581    if (state->index_type != state->last_index_type) {
2582       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2583          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
2584                                     R_03090C_VGT_INDEX_TYPE, 2, state->index_type);
2585       } else {
2586          radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
2587          radeon_emit(cs, state->index_type);
2588       }
2589 
2590       state->last_index_type = state->index_type;
2591    }
2592 
2593    /* For direct indexed draws we use DRAW_INDEX_2, which already includes
2594     * the index_va and max_index_count. */
2595    if (!indirect)
2596       return;
2597 
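   /* For indirect indexed draws the index base and size are instead taken from
    * the state programmed by the two packets below.
    */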
2598    radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2599    radeon_emit(cs, state->index_va);
2600    radeon_emit(cs, state->index_va >> 32);
2601 
2602    radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2603    radeon_emit(cs, state->max_index_count);
2604 
2605    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2606 }
2607 
2608 void
2609 radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
2610 {
2611    bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2612    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2613    uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
2614    uint32_t db_count_control;
2615 
2616    if (!cmd_buffer->state.active_occlusion_queries) {
2617       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2618          if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2619              pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2620             /* Re-enable out-of-order rasterization if the
2621              * bound pipeline supports it and if it has
2622              * been disabled before starting any perfect
2623              * occlusion queries.
2624              */
2625             radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2626          }
2627       }
2628       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2629    } else {
2630       const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2631       uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2632       bool gfx10_perfect =
2633          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
2634 
2635       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2636          /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
2637           * covered tiles, discards, and early depth testing. For more details,
2638           * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
2639          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2640                             S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2641                             S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
2642                             S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
2643 
2644          if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2645              pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2646             /* If the bound pipeline has enabled
2647              * out-of-order rasterization, we should
2648              * disable it before starting any perfect
2649              * occlusion queries.
2650              */
2651             pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2652 
2653             radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2654          }
2655       } else {
2656          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
2657       }
2658    }
2659 
2660    radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2661 
2662    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2663 }
2664 
2665 unsigned
2666 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
2667 {
2668    /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, or a
2669     * single array sorted in ascending order using:
2670     * - total number of attributes
2671     * - number of instanced attributes
2672     * - index of first instanced attribute
2673     */
2674 
2675    /* From total number of attributes to offset. */
2676    static const uint16_t total_to_offset[16] = {0,   1,   4,   10,  20,  35,  56,  84,
2677                                                 120, 165, 220, 286, 364, 455, 560, 680};
2678    unsigned start_index = total_to_offset[num_attributes - 1];
2679 
2680    /* From number of instanced attributes to offset. This would require a different LUT depending on
2681     * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
2682     * attributes.
2683     */
2684    static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
2685                                                        100, 108, 115, 121, 126, 130, 133, 135};
2686    unsigned count = util_bitcount(instance_rate_inputs);
2687    unsigned offset_from_start_index =
2688       count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
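   /* Worked example: with num_attributes = 3 and instance_rate_inputs = 0b010,
    * start_index = total_to_offset[2] = 4, count = 1,
    * offset_from_start_index = 0 - (16 - 3) * 0 = 0 and first = 1, so the
    * returned prolog index is 4 + 0 + 1 = 5.
    */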
2689 
2690    unsigned first = ffs(instance_rate_inputs) - 1;
2691    return start_index + offset_from_start_index + first;
2692 }
2693 
2694 union vs_prolog_key_header {
2695    struct {
2696       uint32_t key_size : 8;
2697       uint32_t num_attributes : 6;
2698       uint32_t as_ls : 1;
2699       uint32_t is_ngg : 1;
2700       uint32_t wave32 : 1;
2701       uint32_t next_stage : 3;
2702       uint32_t instance_rate_inputs : 1;
2703       uint32_t alpha_adjust_lo : 1;
2704       uint32_t alpha_adjust_hi : 1;
2705       uint32_t misaligned_mask : 1;
2706       uint32_t post_shuffle : 1;
2707       uint32_t nontrivial_divisors : 1;
2708       /* We need this to ensure the padding is zero. It's useful even if it's unused. */
2709       uint32_t padding0 : 6;
2710    };
2711    uint32_t v;
2712 };
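/* key_words[0] always holds this header; key_size records the total key size in
 * bytes (header included), which is exactly what radv_hash_vs_prolog() hashes
 * and radv_cmp_vs_prolog() compares.
 */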
2713 
2714 uint32_t
2715 radv_hash_vs_prolog(const void *key_)
2716 {
2717    const uint32_t *key = key_;
2718    union vs_prolog_key_header header;
2719    header.v = key[0];
2720    return _mesa_hash_data(key, header.key_size);
2721 }
2722 
2723 bool
2724 radv_cmp_vs_prolog(const void *a_, const void *b_)
2725 {
2726    const uint32_t *a = a_;
2727    const uint32_t *b = b_;
2728    if (a[0] != b[0])
2729       return false;
2730 
2731    union vs_prolog_key_header header;
2732    header.v = a[0];
2733    return memcmp(a, b, header.key_size) == 0;
2734 }
2735 
2736 static struct radv_shader_prolog *
2737 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
2738                  uint32_t *nontrivial_divisors)
2739 {
2740    STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
2741    assert(vs_shader->info.vs.dynamic_inputs);
2742 
2743    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
2744    const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2745    struct radv_device *device = cmd_buffer->device;
2746 
2747    unsigned num_attributes = pipeline->last_vertex_attrib_bit;
2748    uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
2749 
2750    uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
2751    *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
2752    enum chip_class chip = device->physical_device->rad_info.chip_class;
2753    const uint32_t misaligned_mask = chip == GFX6 || chip >= GFX10 ? cmd_buffer->state.vbo_misaligned_mask : 0;
2754 
2755    /* try to use a pre-compiled prolog first */
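   /* The precompiled instance-rate prologs only cover the case where the
    * instanced attributes form a single contiguous range, which is what the
    * bitcount/last_bit/ffs check below verifies.
    */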
2756    struct radv_shader_prolog *prolog = NULL;
2757    if (pipeline->can_use_simple_input &&
2758        (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
2759        !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
2760       if (!instance_rate_inputs) {
2761          prolog = device->simple_vs_prologs[num_attributes - 1];
2762       } else if (num_attributes <= 16 && !*nontrivial_divisors &&
2763                  util_bitcount(instance_rate_inputs) ==
2764                     (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
2765          unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
2766          prolog = device->instance_rate_vs_prologs[index];
2767       }
2768    }
2769    if (prolog)
2770       return prolog;
2771 
2772    /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
2773    uint32_t key_words[16];
2774    unsigned key_size = 1;
2775 
2776    struct radv_vs_prolog_key key;
2777    key.state = state;
2778    key.num_attributes = num_attributes;
2779    key.misaligned_mask = misaligned_mask;
2780    /* The instance ID input VGPR is placed differently when as_ls=true. */
2781    key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
2782    key.is_ngg = vs_shader->info.is_ngg;
2783    key.wave32 = vs_shader->info.wave_size == 32;
2784    key.next_stage = pipeline->next_vertex_stage;
2785 
2786    union vs_prolog_key_header header;
2787    header.v = 0;
2788    header.num_attributes = num_attributes;
2789    header.as_ls = key.as_ls;
2790    header.is_ngg = key.is_ngg;
2791    header.wave32 = key.wave32;
2792    header.next_stage = key.next_stage;
2793 
2794    if (instance_rate_inputs & ~*nontrivial_divisors) {
2795       header.instance_rate_inputs = true;
2796       key_words[key_size++] = instance_rate_inputs;
2797    }
2798    if (*nontrivial_divisors) {
2799       header.nontrivial_divisors = true;
2800       key_words[key_size++] = *nontrivial_divisors;
2801    }
2802    if (misaligned_mask) {
2803       header.misaligned_mask = true;
2804       key_words[key_size++] = misaligned_mask;
2805 
2806       uint8_t *formats = (uint8_t *)&key_words[key_size];
2807       unsigned num_formats = 0;
2808       u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
2809       while (num_formats & 0x3)
2810          formats[num_formats++] = 0;
2811       key_size += num_formats / 4u;
2812 
2813       if (state->post_shuffle & attribute_mask) {
2814          header.post_shuffle = true;
2815          key_words[key_size++] = state->post_shuffle & attribute_mask;
2816       }
2817    }
2818    if (state->alpha_adjust_lo & attribute_mask) {
2819       header.alpha_adjust_lo = true;
2820       key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
2821    }
2822    if (state->alpha_adjust_hi & attribute_mask) {
2823       header.alpha_adjust_hi = true;
2824       key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
2825    }
2826 
2827    header.key_size = key_size * sizeof(key_words[0]);
2828    key_words[0] = header.v;
2829 
2830    uint32_t hash = radv_hash_vs_prolog(key_words);
2831 
2832    if (cmd_buffer->state.emitted_vs_prolog &&
2833        cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
2834        radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
2835       return cmd_buffer->state.emitted_vs_prolog;
2836 
2837    u_rwlock_rdlock(&device->vs_prologs_lock);
2838    struct hash_entry *prolog_entry =
2839       _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
2840    u_rwlock_rdunlock(&device->vs_prologs_lock);
2841 
2842    if (!prolog_entry) {
2843       u_rwlock_wrlock(&device->vs_prologs_lock);
2844       prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
2845       if (prolog_entry) {
2846          u_rwlock_wrunlock(&device->vs_prologs_lock);
2847          return prolog_entry->data;
2848       }
2849 
2850       prolog = radv_create_vs_prolog(device, &key);
2851       uint32_t *key2 = malloc(key_size * 4);
2852       if (!prolog || !key2) {
2853          radv_prolog_destroy(device, prolog);
2854          free(key2);
2855          u_rwlock_wrunlock(&device->vs_prologs_lock);
2856          return NULL;
2857       }
2858       memcpy(key2, key_words, key_size * 4);
2859       _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
2860 
2861       u_rwlock_wrunlock(&device->vs_prologs_lock);
2862       return prolog;
2863    }
2864 
2865    return prolog_entry->data;
2866 }
2867 
2868 static void
2869 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
2870                  struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
2871 {
2872    /* no need to re-emit anything in this case */
2873    if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
2874       return;
2875 
2876    enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
2877    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2878    uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
2879 
2880    assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
2881    assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);
2882 
2883    uint32_t rsrc1 = vs_shader->config.rsrc1;
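   /* Before GFX10 the SGPR count comes from RSRC1, so use the prolog's (larger)
    * SGPR field when it exceeds the main shader's.
    */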
2884    if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
2885       rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
2886 
2887    /* The main shader must not use fewer VGPRs than the prolog, otherwise shared vgprs might not
2888     * work.
2889     */
2890    assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
2891 
2892    unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
2893    unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
2894    if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
2895       pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
2896       rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
2897    } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
2898       pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
2899       rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
2900    } else if (vs_shader->info.vs.as_ls) {
2901       pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
2902       rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
2903    } else if (vs_shader->info.vs.as_es) {
2904       pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
2905       rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
2906    }
2907 
2908    radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2);
2909    radeon_emit(cmd_buffer->cs, prolog_va >> 8);
2910    radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40));
2911 
2912    if (chip < GFX10)
2913       radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
2914    else
2915       assert(rsrc1 == vs_shader->config.rsrc1);
2916 
2917    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
2918 }
2919 
2920 static void
2921 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
2922                    uint32_t nontrivial_divisors, bool pipeline_is_dirty)
2923 {
2924    /* no need to re-emit anything in this case */
2925    if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
2926        !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
2927       return;
2928 
2929    struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
2930    uint64_t input_va = radv_shader_variant_get_va(vs_shader);
2931 
2932    if (nontrivial_divisors) {
2933       unsigned inputs_offset;
2934       uint32_t *inputs;
2935       unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
2936       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
2937          return;
2938 
2939       *(inputs++) = input_va;
2940       *(inputs++) = input_va >> 32;
2941 
2942       u_foreach_bit(index, nontrivial_divisors)
2943       {
2944          uint32_t div = state->divisors[index];
2945          if (div == 0) {
2946             *(inputs++) = 0;
2947             *(inputs++) = 1;
2948          } else if (util_is_power_of_two_or_zero(div)) {
2949             *(inputs++) = util_logbase2(div) | (1 << 8);
2950             *(inputs++) = 0xffffffffu;
2951          } else {
2952             struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
2953             *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
2954             *(inputs++) = info.multiplier;
2955          }
2956       }
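      /* Each nontrivial divisor thus contributes two dwords to the prolog
       * inputs: a packed pre_shift/increment/post_shift word and the
       * multiplier; the zero and power-of-two cases above fill the same
       * two-dword slot with their simpler encodings.
       */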
2957 
2958       input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
2959    }
2960 
2961    struct radv_userdata_info *loc =
2962       &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
2963    uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX];
2964    assert(loc->sgpr_idx != -1);
2965    assert(loc->num_sgprs == 2);
2966    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
2967                             input_va, true);
2968 }
2969 
2970 static void
2971 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2972 {
2973    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2974    struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
2975 
2976    if (!vs_shader->info.vs.has_prolog)
2977       return;
2978 
2979    uint32_t nontrivial_divisors;
2980    struct radv_shader_prolog *prolog =
2981       lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
2982    if (!prolog) {
2983       cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
2984       return;
2985    }
2986    emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
2987    emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
2988 
2989    cmd_buffer->state.emitted_vs_prolog = prolog;
2990 }
2991 
2992 static void
2993 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2994 {
2995    uint64_t states =
2996       cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
2997 
2998    if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2999       radv_emit_viewport(cmd_buffer);
3000 
3001    if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
3002        !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
3003       radv_emit_scissor(cmd_buffer);
3004 
3005    if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
3006       radv_emit_line_width(cmd_buffer);
3007 
3008    if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
3009       radv_emit_blend_constants(cmd_buffer);
3010 
3011    if (states &
3012        (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
3013         RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
3014       radv_emit_stencil(cmd_buffer);
3015 
3016    if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
3017       radv_emit_depth_bounds(cmd_buffer);
3018 
3019    if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
3020       radv_emit_depth_bias(cmd_buffer);
3021 
3022    if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
3023       radv_emit_discard_rectangle(cmd_buffer);
3024 
3025    if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
3026       radv_emit_sample_locations(cmd_buffer);
3027 
3028    if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
3029       radv_emit_line_stipple(cmd_buffer);
3030 
3031    if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3032                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
3033       radv_emit_culling(cmd_buffer, states);
3034 
3035    if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
3036       radv_emit_primitive_topology(cmd_buffer);
3037 
3038    if (states &
3039        (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
3040         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
3041         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
3042       radv_emit_depth_control(cmd_buffer, states);
3043 
3044    if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
3045       radv_emit_stencil_control(cmd_buffer);
3046 
3047    if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
3048       radv_emit_fragment_shading_rate(cmd_buffer);
3049 
3050    if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
3051       radv_emit_primitive_restart_enable(cmd_buffer);
3052 
3053    if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
3054       radv_emit_rasterizer_discard_enable(cmd_buffer);
3055 
3056    if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
3057       radv_emit_logic_op(cmd_buffer);
3058 
3059    if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
3060       radv_emit_color_write_enable(cmd_buffer);
3061 
3062    if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
3063       radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);
3064 
3065    cmd_buffer->state.dirty &= ~states;
3066 }
3067 
3068 static void
3069 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
3070 {
3071    struct radv_descriptor_state *descriptors_state =
3072       radv_get_descriptors_state(cmd_buffer, bind_point);
3073    struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
3074    unsigned bo_offset;
3075 
3076    if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
3077                                     &bo_offset))
3078       return;
3079 
3080    set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3081    set->header.va += bo_offset;
3082 }
3083 
3084 static void
3085 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
3086                                     struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3087 {
3088    struct radv_descriptor_state *descriptors_state =
3089       radv_get_descriptors_state(cmd_buffer, bind_point);
3090    uint32_t size = MAX_SETS * 4;
3091    uint32_t offset;
3092    void *ptr;
3093 
3094    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
3095       return;
3096 
3097    for (unsigned i = 0; i < MAX_SETS; i++) {
3098       uint32_t *uptr = ((uint32_t *)ptr) + i;
3099       uint64_t set_va = 0;
3100       struct radv_descriptor_set *set = descriptors_state->sets[i];
3101       if (descriptors_state->valid & (1u << i))
3102          set_va = set->header.va;
3103       uptr[0] = set_va & 0xffffffff;
3104    }
3105 
3106    uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3107    va += offset;
3108 
3109    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3110       if (pipeline->shaders[MESA_SHADER_VERTEX])
3111          radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
3112                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3113 
3114       if (pipeline->shaders[MESA_SHADER_FRAGMENT])
3115          radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
3116                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3117 
3118       if (radv_pipeline_has_gs(pipeline))
3119          radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY,
3120                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3121 
3122       if (radv_pipeline_has_tess(pipeline))
3123          radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL,
3124                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3125 
3126       if (radv_pipeline_has_tess(pipeline))
3127          radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL,
3128                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3129    } else {
3130       radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE,
3131                                  AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3132    }
3133 }
3134 
3135 static void
3136 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3137                        struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3138 {
3139    struct radv_descriptor_state *descriptors_state =
3140       radv_get_descriptors_state(cmd_buffer, bind_point);
3141    bool flush_indirect_descriptors;
3142 
3143    if (!descriptors_state->dirty)
3144       return;
3145 
3146    if (descriptors_state->push_dirty)
3147       radv_flush_push_descriptors(cmd_buffer, bind_point);
3148 
3149    flush_indirect_descriptors = pipeline && pipeline->need_indirect_descriptor_sets;
3150 
3151    if (flush_indirect_descriptors)
3152       radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);
3153 
3154    ASSERTED unsigned cdw_max =
3155       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SETS * MESA_SHADER_STAGES * 4);
3156 
3157    if (pipeline) {
3158       if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
3159          radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state,
3160                                        MESA_SHADER_COMPUTE);
3161       } else {
3162          radv_foreach_stage(stage, stages)
3163          {
3164             if (!cmd_buffer->state.pipeline->shaders[stage])
3165                continue;
3166 
3167             radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, stage);
3168          }
3169       }
3170    }
3171 
3172    descriptors_state->dirty = 0;
3173    descriptors_state->push_dirty = false;
3174 
3175    assert(cmd_buffer->cs->cdw <= cdw_max);
3176 
3177    if (unlikely(cmd_buffer->device->trace_bo))
3178       radv_save_descriptors(cmd_buffer, bind_point);
3179 }
3180 
3181 static bool
3182 radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
3183 {
3184    struct radv_userdata_info *loc =
3185       radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
3186    return loc->sgpr_idx != -1;
3187 }
3188 
3189 static void
3190 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3191                      struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3192 {
3193    struct radv_descriptor_state *descriptors_state =
3194       radv_get_descriptors_state(cmd_buffer, bind_point);
3195    struct radv_shader_variant *shader, *prev_shader;
3196    bool need_push_constants = false;
3197    unsigned offset;
3198    void *ptr;
3199    uint64_t va;
3200    uint32_t internal_stages;
3201    uint32_t dirty_stages = 0;
3202 
3203    stages &= cmd_buffer->push_constant_stages;
3204    if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
3205       return;
3206 
3207    internal_stages = stages;
3208    switch (bind_point) {
3209    case VK_PIPELINE_BIND_POINT_GRAPHICS:
3210       break;
3211    case VK_PIPELINE_BIND_POINT_COMPUTE:
3212       dirty_stages = RADV_RT_STAGE_BITS;
3213       break;
3214    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
3215       internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3216       dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3217       break;
3218    default:
3219       unreachable("Unhandled bind point");
3220    }
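   /* Ray tracing pipelines are executed through the compute bind point, so
    * constants flushed for one of these bind points are marked dirty again for
    * the other via dirty_stages.
    */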
3221 
3222    radv_foreach_stage(stage, internal_stages)
3223    {
3224       shader = radv_get_shader(pipeline, stage);
3225       if (!shader)
3226          continue;
3227 
3228       need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
3229 
3230       uint8_t base = shader->info.min_push_constant_used / 4;
3231 
3232       radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3233                                    (uint32_t *)&cmd_buffer->push_constants[base * 4]);
3234    }
3235 
3236    if (need_push_constants) {
3237       if (!radv_cmd_buffer_upload_alloc(
3238              cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
3239              &ptr))
3240          return;
3241 
3242       memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
3243       memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
3244              16 * pipeline->dynamic_offset_count);
3245 
3246       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3247       va += offset;
3248 
3249       ASSERTED unsigned cdw_max =
3250          radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_SHADER_STAGES * 4);
3251 
3252       prev_shader = NULL;
3253       radv_foreach_stage(stage, internal_stages)
3254       {
3255          shader = radv_get_shader(pipeline, stage);
3256 
3257          /* Avoid redundantly emitting the address for merged stages. */
3258          if (shader && shader != prev_shader) {
3259             radv_emit_userdata_address(cmd_buffer, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
3260 
3261             prev_shader = shader;
3262          }
3263       }
3264       assert(cmd_buffer->cs->cdw <= cdw_max);
3265    }
3266 
3267    cmd_buffer->push_constant_stages &= ~stages;
3268    cmd_buffer->push_constant_stages |= dirty_stages;
3269 }
3270 
3271 enum radv_dst_sel {
3272    DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3273                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3274    DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3275                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3276    DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3277                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3278    DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3279                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3280    DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3281                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3282    DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3283                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3284 };
3285 
3286 static const uint32_t data_format_dst_sel[] = {
3287    [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
3288    [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
3289    [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
3290    [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
3291    [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
3292    [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
3293    [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
3294    [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
3295    [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
3296    [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
3297    [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
3298    [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
3299    [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
3300    [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
3301    [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
3302 };
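/* Maps a buffer data format to the destination swizzle used for dynamic vertex
 * input descriptors below, so that components missing from the format read back
 * as 0 (and a missing alpha channel as 1).
 */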
3303 
3304 static void
3305 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3306 {
3307    if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
3308        cmd_buffer->state.pipeline->vb_desc_usage_mask) {
3309       struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3310       struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
3311       enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
3312       unsigned vb_offset;
3313       void *vb_ptr;
3314       unsigned desc_index = 0;
3315       uint32_t mask = pipeline->vb_desc_usage_mask;
3316       uint64_t va;
3317       struct radv_vs_input_state *vs_state =
3318          vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
3319 
3320       /* allocate some descriptor state for vertex buffers */
3321       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
3322          return;
3323 
3324       assert(!vs_state || pipeline->use_per_attribute_vb_descs);
3325 
3326       while (mask) {
3327          unsigned i = u_bit_scan(&mask);
3328          uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
3329          uint32_t offset, rsrc_word3;
3330          unsigned binding =
3331             vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
3332                      : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
3333          struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
3334          unsigned num_records;
3335          unsigned stride;
3336 
3337          if (vs_state) {
3338             unsigned format = vs_state->formats[i];
3339             unsigned dfmt = format & 0xf;
3340             unsigned nfmt = (format >> 4) & 0x7;
3341 
3342             rsrc_word3 =
3343                vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
3344 
3345             if (chip >= GFX10)
3346                rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
3347             else
3348                rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
3349          } else {
3350             if (chip >= GFX10)
3351                rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
3352             else
3353                rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
3354                             S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3355          }
3356 
3357          if (!buffer) {
3358             if (vs_state) {
3359                /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
3360                 * to include the format/word3 so that the alpha channel is 1 for formats without an
3361                 * alpha channel.
3362                 */
3363                desc[0] = 0;
3364                desc[1] = S_008F04_STRIDE(16);
3365                desc[2] = 0;
3366                desc[3] = rsrc_word3;
3367             } else {
3368                memset(desc, 0, 4 * 4);
3369             }
3370             continue;
3371          }
3372 
3373          va = radv_buffer_get_va(buffer->bo);
3374 
3375          offset = cmd_buffer->vertex_bindings[binding].offset;
3376          va += offset + buffer->offset;
3377          if (vs_state)
3378             va += vs_state->offsets[i];
3379 
3380          if (cmd_buffer->vertex_bindings[binding].size) {
3381             num_records = cmd_buffer->vertex_bindings[binding].size;
3382          } else {
3383             num_records = buffer->size - offset;
3384          }
3385 
3386          if (pipeline->graphics.uses_dynamic_stride) {
3387             stride = cmd_buffer->vertex_bindings[binding].stride;
3388          } else {
3389             stride = pipeline->binding_stride[binding];
3390          }
3391 
3392          if (pipeline->use_per_attribute_vb_descs) {
3393             uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
3394                                            : pipeline->attrib_ends[i];
3395 
3396             if (num_records < attrib_end) {
3397                num_records = 0; /* not enough space for one vertex */
3398             } else if (stride == 0) {
3399                num_records = 1; /* only one vertex */
3400             } else {
3401                num_records = (num_records - attrib_end) / stride + 1;
3402                /* If attrib_offset>stride, then the compiler will increase the vertex index by
3403                 * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
3404                 * only allowed with static strides.
3405                 */
3406                num_records += pipeline->attrib_index_offset[i];
3407             }
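            /* As a concrete example, a 100-byte binding with attrib_end = 8 and
             * stride = 16 yields (100 - 8) / 16 + 1 = 6 records before the
             * attrib_index_offset adjustment.
             */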
3408 
3409             /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
3410              * into bytes in that case. GFX8 always uses bytes.
3411              */
3412             if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
3413                num_records = (num_records - 1) * stride + attrib_end;
3414             } else if (!num_records) {
3415                /* On GFX9, it seems bounds checking is disabled if both
3416                 * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
3417                 * GFX10.3 but it doesn't hurt.
3418                 */
3419                if (vs_state) {
3420                   desc[0] = 0;
3421                   desc[1] = S_008F04_STRIDE(16);
3422                   desc[2] = 0;
3423                   desc[3] = rsrc_word3;
3424                } else {
3425                   memset(desc, 0, 16);
3426                }
3427                continue;
3428             }
3429          } else {
3430             if (chip != GFX8 && stride)
3431                num_records = DIV_ROUND_UP(num_records, stride);
3432          }
3433 
3434          if (chip >= GFX10) {
3435             /* OOB_SELECT chooses the out-of-bounds check:
3436              * - 1: index >= NUM_RECORDS (Structured)
3437              * - 3: offset >= NUM_RECORDS (Raw)
3438              */
3439             int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
3440             rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
3441          }
3442 
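         /* Final vertex buffer descriptor: dwords 0-1 hold the 48-bit address
          * and the stride, dword 2 the record count and dword 3 the
          * dst_sel/format word built above.
          */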
3443          desc[0] = va;
3444          desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
3445          desc[2] = num_records;
3446          desc[3] = rsrc_word3;
3447       }
3448 
3449       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3450       va += vb_offset;
3451 
3452       radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS,
3453                                  va);
3454 
3455       cmd_buffer->state.vb_va = va;
3456       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
3457 
3458       if (unlikely(cmd_buffer->device->trace_bo))
3459          radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
3460    }
3461    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
3462 }
3463 
3464 static void
3465 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
3466 {
3467    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3468    struct radv_userdata_info *loc;
3469    uint32_t base_reg;
3470 
3471    for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
3472       if (!radv_get_shader(pipeline, stage))
3473          continue;
3474 
3475       loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_STREAMOUT_BUFFERS);
3476       if (loc->sgpr_idx == -1)
3477          continue;
3478 
3479       base_reg = pipeline->user_data_0[stage];
3480 
3481       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3482                                false);
3483    }
3484 
3485    if (radv_pipeline_has_gs_copy_shader(pipeline)) {
3486       loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3487       if (loc->sgpr_idx != -1) {
3488          base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3489 
3490          radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3491                                   va, false);
3492       }
3493    }
3494 }
3495 
3496 static void
3497 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
3498 {
3499    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
3500       struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
3501       struct radv_streamout_state *so = &cmd_buffer->state.streamout;
3502       unsigned so_offset;
3503       void *so_ptr;
3504       uint64_t va;
3505 
3506       /* Allocate some descriptor state for streamout buffers. */
3507       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
3508          return;
3509 
3510       for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
3511          struct radv_buffer *buffer = sb[i].buffer;
3512          uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
3513 
3514          if (!(so->enabled_mask & (1 << i)))
3515             continue;
3516 
3517          va = radv_buffer_get_va(buffer->bo) + buffer->offset;
3518 
3519          va += sb[i].offset;
3520 
3521          /* Set the descriptor.
3522           *
3523           * On GFX8, the format must be non-INVALID, otherwise
3524           * the buffer will be considered not bound and store
3525           * instructions will be no-ops.
3526           */
3527          uint32_t size = 0xffffffff;
3528 
3529          /* Compute the correct buffer size for NGG streamout
3530           * because it's used to determine the max emit per
3531           * buffer.
3532           */
3533          if (cmd_buffer->device->physical_device->use_ngg_streamout)
3534             size = buffer->size - sb[i].offset;
3535 
3536          uint32_t rsrc_word3 =
3537             S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3538             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3539 
3540          if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
3541             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
3542                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
3543          } else {
3544             rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3545          }
3546 
3547          desc[0] = va;
3548          desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3549          desc[2] = size;
3550          desc[3] = rsrc_word3;
3551       }
3552 
3553       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3554       va += so_offset;
3555 
3556       radv_emit_streamout_buffers(cmd_buffer, va);
3557    }
3558 
3559    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
3560 }
3561 
3562 static void
3563 radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
3564 {
3565    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3566    struct radv_userdata_info *loc;
3567    uint32_t ngg_gs_state = 0;
3568    uint32_t base_reg;
3569 
3570    if (!radv_pipeline_has_gs(pipeline) || !pipeline->graphics.is_ngg)
3571       return;
3572 
3573    /* By default NGG GS queries are disabled but they are enabled if the
3574     * command buffer has active GDS queries or if it's a secondary command
3575     * buffer that inherits the number of generated primitives.
3576     */
3577    if (cmd_buffer->state.active_pipeline_gds_queries ||
3578        (cmd_buffer->state.inherited_pipeline_statistics &
3579         VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
3580       ngg_gs_state = 1;
3581 
3582    loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, AC_UD_NGG_GS_STATE);
3583    base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
3584    assert(loc->sgpr_idx != -1);
3585 
3586    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_gs_state);
3587 }
3588 
3589 static void
3590 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3591 {
3592    radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
3593    radv_flush_streamout_descriptors(cmd_buffer);
3594    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
3595                           VK_PIPELINE_BIND_POINT_GRAPHICS);
3596    radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
3597                         VK_PIPELINE_BIND_POINT_GRAPHICS);
3598    radv_flush_ngg_gs_state(cmd_buffer);
3599 }
3600 
3601 struct radv_draw_info {
3602    /**
3603     * Number of vertices.
3604     */
3605    uint32_t count;
3606 
3607    /**
3608     * First instance id.
3609     */
3610    uint32_t first_instance;
3611 
3612    /**
3613     * Number of instances.
3614     */
3615    uint32_t instance_count;
3616 
3617    /**
3618     * Whether it's an indexed draw.
3619     */
3620    bool indexed;
3621 
3622    /**
3623     * Indirect draw parameters resource.
3624     */
3625    struct radv_buffer *indirect;
3626    uint64_t indirect_offset;
3627    uint32_t stride;
3628 
3629    /**
3630     * Draw count parameters resource.
3631     */
3632    struct radv_buffer *count_buffer;
3633    uint64_t count_buffer_offset;
3634 
3635    /**
3636     * Stream output parameters resource.
3637     */
3638    struct radv_buffer *strmout_buffer;
3639    uint64_t strmout_buffer_offset;
3640 };
3641 
3642 static uint32_t
3643 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
3644 {
3645    switch (cmd_buffer->state.index_type) {
3646    case V_028A7C_VGT_INDEX_8:
3647       return 0xffu;
3648    case V_028A7C_VGT_INDEX_16:
3649       return 0xffffu;
3650    case V_028A7C_VGT_INDEX_32:
3651       return 0xffffffffu;
3652    default:
3653       unreachable("invalid index type");
3654    }
3655 }
3656 
3657 static void
3658 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
3659                            bool indirect_draw, bool count_from_stream_output,
3660                            uint32_t draw_vertex_count)
3661 {
3662    struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3663    struct radv_cmd_state *state = &cmd_buffer->state;
3664    unsigned topology = state->dynamic.primitive_topology;
3665    bool prim_restart_enable = state->dynamic.primitive_restart_enable;
3666    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3667    unsigned ia_multi_vgt_param;
3668 
3669    ia_multi_vgt_param =
3670       si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
3671                                 draw_vertex_count, topology, prim_restart_enable);
3672 
3673    if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
3674       if (info->chip_class == GFX9) {
3675          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
3676                                     R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
3677       } else if (info->chip_class >= GFX7) {
3678          radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
3679       } else {
3680          radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
3681       }
3682       state->last_ia_multi_vgt_param = ia_multi_vgt_param;
3683    }
3684 }
3685 
3686 static void
3687 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
3688 {
3689    struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3690    struct radv_cmd_state *state = &cmd_buffer->state;
3691    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3692 
3693    /* Draw state. */
3694    if (info->chip_class < GFX10) {
3695       si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
3696                                  !!draw_info->strmout_buffer,
3697                                  draw_info->indirect ? 0 : draw_info->count);
3698    }
3699 
3700    if (state->dynamic.primitive_restart_enable) {
3701       uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
3702 
3703       if (primitive_reset_index != state->last_primitive_reset_index) {
3704          radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
3705          state->last_primitive_reset_index = primitive_reset_index;
3706       }
3707    }
3708 
3709    if (draw_info->strmout_buffer) {
3710       uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
3711 
3712       va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
3713 
3714       radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
3715 
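      /* Copy the transform feedback "buffer filled size" from memory into
       * VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE so the hardware can derive
       * the vertex count from it and the stride programmed above.
       */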
3716       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
3717       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
3718                          COPY_DATA_WR_CONFIRM);
3719       radeon_emit(cs, va);
3720       radeon_emit(cs, va >> 32);
3721       radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
3722       radeon_emit(cs, 0); /* unused */
3723 
3724       radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
3725    }
3726 }
3727 
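/* Translate the source stage mask of a barrier into the partial flushes
 * (CS/PS/VS) that make the corresponding shader work finish before later
 * commands execute.
 */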
3728 static void
3729 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_stage_mask)
3730 {
3731    if (src_stage_mask &
3732        (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT |
3733         VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
3734         VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
3735         VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
3736       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
3737    }
3738 
3739    if (src_stage_mask &
3740        (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
3741         VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
3742         VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
3743         VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
3744       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
3745    } else if (src_stage_mask &
3746               (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
3747                VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
3748                VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
3749                VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
3750                VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
3751                VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
3752       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
3753    }
3754 }
3755 
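/* Whether L2 flushes can be skipped for buffer barriers: true when the RB
 * caches are coherent with L2, i.e. on GFX9, and on GFX10+ unless the L2 (TCC)
 * is non-coherent with the RBs.
 */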
3756 static bool
3757 can_skip_buffer_l2_flushes(struct radv_device *device)
3758 {
3759    return device->physical_device->rad_info.chip_class == GFX9 ||
3760           (device->physical_device->rad_info.chip_class >= GFX10 &&
3761            !device->physical_device->rad_info.tcc_rb_non_coherent);
3762 }
3763 
3764 /*
3765  * In Vulkan, barriers have two kinds of operations:
3766  *
3767  * - availability (implemented with radv_src_access_flush)
3768  * - visibility (implemented with radv_dst_access_flush)
3769  *
3770  * For a memory operation to observe the result of a previous memory operation,
3771  * one needs to do an availability operation on the source memory and then a
3772  * visibility operation on the target memory.
3773  *
3774  * The complication is that the availability and visibility operations do not
3775  * need to be in the same barrier.
3776  *
3777  * The cleanest way to implement this is to define the availability operation to
3778  * bring the caches to a "state of rest", in which none of the caches below that
3779  * level are dirty.
3780  *
3781  * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
3782  *
3783  * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
3784  * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
3785  * images. However, given the existence of memory barriers which do not specify
3786  * the image/buffer, it often devolves to just VRAM/GTT anyway.
3787  *
3788  * To help reduce the invalidations for GPUs that have L2 coherency between the
3789  * RB and the shader caches, we always invalidate L2 on the src side, as we can
3790  * use our knowledge of past usage to optimize flushes away.
3791  */
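
/*
 * As a rough worked example (assuming a color image that is not L2-coherent
 * and has no CB metadata): a barrier with srcAccessMask =
 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT and dstAccessMask =
 * VK_ACCESS_SHADER_READ_BIT maps to FLUSH_AND_INV_CB on the source side
 * (radv_src_access_flush) and to INV_VCACHE, typically together with an L2
 * invalidation, on the destination side (radv_dst_access_flush).
 */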
3792 
3793 enum radv_cmd_flush_bits
3794 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags,
3795                       const struct radv_image *image)
3796 {
3797    bool has_CB_meta = true, has_DB_meta = true;
3798    bool image_is_coherent = image ? image->l2_coherent : false;
3799    enum radv_cmd_flush_bits flush_bits = 0;
3800 
3801    if (image) {
3802       if (!radv_image_has_CB_metadata(image))
3803          has_CB_meta = false;
3804       if (!radv_image_has_htile(image))
3805          has_DB_meta = false;
3806    }
3807 
3808    u_foreach_bit(b, src_flags)
3809    {
3810       switch ((VkAccessFlagBits)(1 << b)) {
3811       case VK_ACCESS_SHADER_WRITE_BIT:
3812          /* Since the STORAGE bit isn't set, we know this is a meta operation.
3813           * On the dst flush side we skip CB/DB flushes without the STORAGE bit,
3814           * so set them here. */
3815          if (image && !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3816             if (vk_format_is_depth_or_stencil(image->vk_format)) {
3817                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3818             } else {
3819                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3820             }
3821          }
3822 
3823          /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
3824           * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
3825           * invalidating. */
3826          if (!image_is_coherent)
3827             flush_bits |= RADV_CMD_FLAG_WB_L2;
3828          break;
3829       case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3830       case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3831       case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3832          if (!image_is_coherent)
3833             flush_bits |= RADV_CMD_FLAG_WB_L2;
3834          break;
3835       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3836          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3837          if (has_CB_meta)
3838             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3839          break;
3840       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3841          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3842          if (has_DB_meta)
3843             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3844          break;
3845       case VK_ACCESS_TRANSFER_WRITE_BIT:
3846          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3847 
3848          if (!image_is_coherent)
3849             flush_bits |= RADV_CMD_FLAG_INV_L2;
3850          if (has_CB_meta)
3851             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3852          if (has_DB_meta)
3853             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3854          break;
3855       case VK_ACCESS_MEMORY_WRITE_BIT:
3856          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3857 
3858          if (!image_is_coherent)
3859             flush_bits |= RADV_CMD_FLAG_INV_L2;
3860          if (has_CB_meta)
3861             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3862          if (has_DB_meta)
3863             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3864          break;
3865       default:
3866          break;
3867       }
3868    }
3869    return flush_bits;
3870 }
3871 
3872 enum radv_cmd_flush_bits
3873 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags,
3874                       const struct radv_image *image)
3875 {
3876    bool has_CB_meta = true, has_DB_meta = true;
3877    enum radv_cmd_flush_bits flush_bits = 0;
3878    bool flush_CB = true, flush_DB = true;
3879    bool image_is_coherent = image ? image->l2_coherent : false;
3880 
3881    if (image) {
3882       if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3883          flush_CB = false;
3884          flush_DB = false;
3885       }
3886 
3887       if (!radv_image_has_CB_metadata(image))
3888          has_CB_meta = false;
3889       if (!radv_image_has_htile(image))
3890          has_DB_meta = false;
3891    }
3892 
3893    /* None of the L2 invalidations below affect the CB/DB. So if there are no incoherent images
3894     * in the L2 cache in CB/DB mode, then they are already usable from all the other L2 clients. */
3895    image_is_coherent |=
3896       can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
3897 
3898    u_foreach_bit(b, dst_flags)
3899    {
3900       switch ((VkAccessFlagBits)(1 << b)) {
3901       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
3902       case VK_ACCESS_INDEX_READ_BIT:
3903       case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3904          break;
3905       case VK_ACCESS_UNIFORM_READ_BIT:
3906          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3907          break;
3908       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
3909       case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
3910       case VK_ACCESS_TRANSFER_READ_BIT:
3911       case VK_ACCESS_TRANSFER_WRITE_BIT:
3912          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3913 
3914          if (has_CB_meta || has_DB_meta)
3915             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
3916          if (!image_is_coherent)
3917             flush_bits |= RADV_CMD_FLAG_INV_L2;
3918          break;
3919       case VK_ACCESS_SHADER_READ_BIT:
3920          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3921          /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
3922           * invalidate the scalar cache. */
3923          if (!cmd_buffer->device->physical_device->use_llvm && !image)
3924             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
3925 
3926          if (has_CB_meta || has_DB_meta)
3927             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
3928          if (!image_is_coherent)
3929             flush_bits |= RADV_CMD_FLAG_INV_L2;
3930          break;
3931       case VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR:
3932          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3933          if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
3934             flush_bits |= RADV_CMD_FLAG_INV_L2;
3935          break;
3936       case VK_ACCESS_SHADER_WRITE_BIT:
3937       case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3938          break;
3939       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
3940       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3941          if (flush_CB)
3942             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3943          if (has_CB_meta)
3944             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3945          break;
3946       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
3947       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3948          if (flush_DB)
3949             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3950          if (has_DB_meta)
3951             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3952          break;
3953       case VK_ACCESS_MEMORY_READ_BIT:
3954       case VK_ACCESS_MEMORY_WRITE_BIT:
3955          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3956          if (!image_is_coherent)
3957             flush_bits |= RADV_CMD_FLAG_INV_L2;
3958          if (flush_CB)
3959             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3960          if (has_CB_meta)
3961             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3962          if (flush_DB)
3963             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3964          if (has_DB_meta)
3965             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3966          break;
3967       default:
3968          break;
3969       }
3970    }
3971    return flush_bits;
3972 }
3973 
3974 void
3975 radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
3976 {
3977    struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
3978    if (fb && !fb->imageless) {
3979       for (int i = 0; i < fb->attachment_count; ++i) {
3980          cmd_buffer->state.flush_bits |=
3981             radv_src_access_flush(cmd_buffer, barrier->src_access_mask, fb->attachments[i]->image);
3982       }
3983    } else {
3984       cmd_buffer->state.flush_bits |=
3985          radv_src_access_flush(cmd_buffer, barrier->src_access_mask, NULL);
3986    }
3987 
3988    radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
3989 
3990    if (fb && !fb->imageless) {
3991       for (int i = 0; i < fb->attachment_count; ++i) {
3992          cmd_buffer->state.flush_bits |=
3993             radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, fb->attachments[i]->image);
3994       }
3995    } else {
3996       cmd_buffer->state.flush_bits |=
3997          radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, NULL);
3998    }
3999 }
4000 
4001 uint32_t
4002 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
4003 {
4004    struct radv_cmd_state *state = &cmd_buffer->state;
4005    uint32_t subpass_id = state->subpass - state->pass->subpasses;
4006 
4007    /* The id of this subpass shouldn't exceed the number of subpasses in
4008     * this render pass minus 1.
4009     */
4010    assert(subpass_id < state->pass->subpass_count);
4011    return subpass_id;
4012 }
4013 
4014 static struct radv_sample_locations_state *
4015 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
4016                                      bool begin_subpass)
4017 {
4018    struct radv_cmd_state *state = &cmd_buffer->state;
4019    uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4020    struct radv_image_view *view = state->attachments[att_idx].iview;
4021 
4022    if (view->image->info.samples == 1)
4023       return NULL;
4024 
4025    if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
4026       /* Return the initial sample locations if this is the initial
4027        * layout transition of the given subpass attachment.
4028        */
4029       if (state->attachments[att_idx].sample_location.count > 0)
4030          return &state->attachments[att_idx].sample_location;
4031    } else {
4032       /* Otherwise return the subpass sample locations if defined. */
4033       if (state->subpass_sample_locs) {
4034          /* Because the driver sets the current subpass before
4035           * initial layout transitions, we should use the sample
4036           * locations from the previous subpass to avoid an
4037           * off-by-one problem. Otherwise, use the sample
4038           * locations for the current subpass for final layout
4039           * transitions.
4040           */
4041          if (begin_subpass)
4042             subpass_id--;
4043 
4044          for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
4045             if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
4046                return &state->subpass_sample_locs[i].sample_location;
4047          }
4048       }
4049    }
4050 
4051    return NULL;
4052 }
4053 
4054 static void
4055 radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
4056                                      struct radv_subpass_attachment att, bool begin_subpass)
4057 {
4058    unsigned idx = att.attachment;
4059    struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
4060    struct radv_sample_locations_state *sample_locs;
4061    VkImageSubresourceRange range;
4062    range.aspectMask = view->aspect_mask;
4063    range.baseMipLevel = view->base_mip;
4064    range.levelCount = 1;
4065    range.baseArrayLayer = view->base_layer;
4066    range.layerCount = cmd_buffer->state.framebuffer->layers;
4067 
4068    if (cmd_buffer->state.subpass->view_mask) {
4069       /* If the current subpass uses multiview, the driver might have
4070        * performed a fast color/depth clear to the whole image
4071        * (including all layers). To make sure the driver will
4072        * decompress the image correctly (if needed), we have to
4073        * account for the "real" number of layers. If the view mask is
4074        * sparse, this will decompress more layers than needed.
4075        */
4076       range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
4077    }
4078 
4079    /* Get the subpass sample locations for the given attachment; if NULL
4080     * is returned, the driver will use the default HW locations.
4081     */
4082    sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
4083 
4084    /* Determine if the subpass uses separate depth/stencil layouts. */
4085    bool uses_separate_depth_stencil_layouts = false;
4086    if ((cmd_buffer->state.attachments[idx].current_layout !=
4087         cmd_buffer->state.attachments[idx].current_stencil_layout) ||
4088        (att.layout != att.stencil_layout)) {
4089       uses_separate_depth_stencil_layouts = true;
4090    }
4091 
4092    /* For separate layouts, perform depth and stencil transitions
4093     * separately.
4094     */
4095    if (uses_separate_depth_stencil_layouts &&
4096        (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
4097       /* Depth-only transitions. */
4098       range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4099       radv_handle_image_transition(cmd_buffer, view->image,
4100                                    cmd_buffer->state.attachments[idx].current_layout,
4101                                    cmd_buffer->state.attachments[idx].current_in_render_loop,
4102                                    att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4103 
4104       /* Stencil-only transitions. */
4105       range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
4106       radv_handle_image_transition(
4107          cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
4108          cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
4109          att.in_render_loop, 0, 0, &range, sample_locs);
4110    } else {
4111       radv_handle_image_transition(cmd_buffer, view->image,
4112                                    cmd_buffer->state.attachments[idx].current_layout,
4113                                    cmd_buffer->state.attachments[idx].current_in_render_loop,
4114                                    att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4115    }
4116 
4117    cmd_buffer->state.attachments[idx].current_layout = att.layout;
4118    cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
4119    cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
4120 }
4121 
4122 void
4123 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
4124 {
4125    cmd_buffer->state.subpass = subpass;
4126 
4127    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
4128 }
4129 
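/* Copy the VK_EXT_sample_locations data attached to vkCmdBeginRenderPass()
 * (initial attachment sample locations and post-subpass sample locations) into
 * the command buffer state, so that later layout transitions can use it.
 */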
4130 static VkResult
4131 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
4132                                       struct radv_render_pass *pass,
4133                                       const VkRenderPassBeginInfo *info)
4134 {
4135    const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
4136       vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
4137    struct radv_cmd_state *state = &cmd_buffer->state;
4138 
4139    if (!sample_locs) {
4140       state->subpass_sample_locs = NULL;
4141       return VK_SUCCESS;
4142    }
4143 
4144    for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
4145       const VkAttachmentSampleLocationsEXT *att_sample_locs =
4146          &sample_locs->pAttachmentInitialSampleLocations[i];
4147       uint32_t att_idx = att_sample_locs->attachmentIndex;
4148       struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
4149 
4150       assert(vk_format_is_depth_or_stencil(image->vk_format));
4151 
4152       /* From the Vulkan spec 1.1.108:
4153        *
4154        * "If the image referenced by the framebuffer attachment at
4155        *  index attachmentIndex was not created with
4156        *  VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
4157        *  then the values specified in sampleLocationsInfo are
4158        *  ignored."
4159        */
4160       if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
4161          continue;
4162 
4163       const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
4164 
4165       state->attachments[att_idx].sample_location.per_pixel =
4166          sample_locs_info->sampleLocationsPerPixel;
4167       state->attachments[att_idx].sample_location.grid_size =
4168          sample_locs_info->sampleLocationGridSize;
4169       state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
4170       typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
4171                    sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4172    }
4173 
4174    state->subpass_sample_locs =
4175       vk_alloc(&cmd_buffer->pool->alloc,
4176                sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
4177                8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4178    if (state->subpass_sample_locs == NULL) {
4179       cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4180       return cmd_buffer->record_result;
4181    }
4182 
4183    state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
4184 
4185    for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
4186       const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
4187          &sample_locs->pPostSubpassSampleLocations[i];
4188       const VkSampleLocationsInfoEXT *sample_locs_info =
4189          &subpass_sample_locs_info->sampleLocationsInfo;
4190 
4191       state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
4192       state->subpass_sample_locs[i].sample_location.per_pixel =
4193          sample_locs_info->sampleLocationsPerPixel;
4194       state->subpass_sample_locs[i].sample_location.grid_size =
4195          sample_locs_info->sampleLocationGridSize;
4196       state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
4197       typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
4198                    sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4199    }
4200 
4201    return VK_SUCCESS;
4202 }
4203 
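/* Allocate and initialize the per-attachment command buffer state for a render
 * pass instance: pending clear aspects, clear values, initial layouts and the
 * color or depth/stencil surface state for each attachment's image view.
 */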
4204 static VkResult
4205 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
4206                                  const VkRenderPassBeginInfo *info,
4207                                  const struct radv_extra_render_pass_begin_info *extra)
4208 {
4209    struct radv_cmd_state *state = &cmd_buffer->state;
4210    const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
4211 
4212    if (info) {
4213       attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4214    }
4215 
4216    if (pass->attachment_count == 0) {
4217       state->attachments = NULL;
4218       return VK_SUCCESS;
4219    }
4220 
4221    state->attachments =
4222       vk_alloc(&cmd_buffer->pool->alloc, pass->attachment_count * sizeof(state->attachments[0]), 8,
4223                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4224    if (state->attachments == NULL) {
4225       cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4226       return cmd_buffer->record_result;
4227    }
4228 
4229    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
4230       struct radv_render_pass_attachment *att = &pass->attachments[i];
4231       VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
4232       VkImageAspectFlags clear_aspects = 0;
4233 
4234       if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4235          /* color attachment */
4236          if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4237             clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
4238          }
4239       } else {
4240          /* depthstencil attachment */
4241          if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
4242              att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4243             clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4244             if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4245                 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
4246                clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4247          }
4248          if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4249              att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4250             clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4251          }
4252       }
4253 
4254       state->attachments[i].pending_clear_aspects = clear_aspects;
4255       state->attachments[i].cleared_views = 0;
4256       if (clear_aspects && info) {
4257          assert(info->clearValueCount > i);
4258          state->attachments[i].clear_value = info->pClearValues[i];
4259       }
4260 
4261       state->attachments[i].current_layout = att->initial_layout;
4262       state->attachments[i].current_in_render_loop = false;
4263       state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
4264       state->attachments[i].disable_dcc = extra && extra->disable_dcc;
4265       state->attachments[i].sample_location.count = 0;
4266 
4267       struct radv_image_view *iview;
4268       if (attachment_info && attachment_info->attachmentCount > i) {
4269          iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
4270       } else {
4271          iview = state->framebuffer->attachments[i];
4272       }
4273 
4274       state->attachments[i].iview = iview;
4275       if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4276          radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
4277       } else {
4278          radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
4279       }
4280    }
4281 
4282    return VK_SUCCESS;
4283 }
4284 
4285 VkResult
4286 radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
4287                             VkCommandBuffer *pCommandBuffers)
4288 {
4289    RADV_FROM_HANDLE(radv_device, device, _device);
4290    RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
4291 
4292    VkResult result = VK_SUCCESS;
4293    uint32_t i;
4294 
4295    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
4296 
4297       if (!list_is_empty(&pool->free_cmd_buffers)) {
4298          struct radv_cmd_buffer *cmd_buffer =
4299             list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
4300 
4301          list_del(&cmd_buffer->pool_link);
4302          list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
4303 
4304          result = radv_reset_cmd_buffer(cmd_buffer);
4305          cmd_buffer->level = pAllocateInfo->level;
4306          vk_command_buffer_finish(&cmd_buffer->vk);
4307          VkResult init_result =
4308             vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
4309          if (init_result != VK_SUCCESS)
4310             result = init_result;
4311 
4312          pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
4313       } else {
4314          result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
4315       }
4316       if (result != VK_SUCCESS)
4317          break;
4318    }
4319 
4320    if (result != VK_SUCCESS) {
4321       radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
4322 
4323       /* From the Vulkan 1.0.66 spec:
4324        *
4325        * "vkAllocateCommandBuffers can be used to create multiple
4326        *  command buffers. If the creation of any of those command
4327        *  buffers fails, the implementation must destroy all
4328        *  successfully created command buffer objects from this
4329        *  command, set all entries of the pCommandBuffers array to
4330        *  NULL and return the error."
4331        */
4332       memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
4333    }
4334 
4335    return result;
4336 }
4337 
4338 void
4339 radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
4340                         const VkCommandBuffer *pCommandBuffers)
4341 {
4342    for (uint32_t i = 0; i < commandBufferCount; i++) {
4343       RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
4344 
4345       if (cmd_buffer) {
4346          if (cmd_buffer->pool) {
4347             list_del(&cmd_buffer->pool_link);
4348             list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
4349          } else
4350             radv_destroy_cmd_buffer(cmd_buffer);
4351       }
4352    }
4353 }
4354 
4355 VkResult
4356 radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
4357 {
4358    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4359    return radv_reset_cmd_buffer(cmd_buffer);
4360 }
4361 
4362 VkResult
4363 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
4364 {
4365    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4366    VkResult result = VK_SUCCESS;
4367 
4368    if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
4369       /* If the command buffer has already been reset with
4370        * vkResetCommandBuffer, there is no need to do it again.
4371        */
4372       result = radv_reset_cmd_buffer(cmd_buffer);
4373       if (result != VK_SUCCESS)
4374          return result;
4375    }
4376 
4377    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
4378    cmd_buffer->state.last_primitive_reset_en = -1;
4379    cmd_buffer->state.last_index_type = -1;
4380    cmd_buffer->state.last_num_instances = -1;
4381    cmd_buffer->state.last_vertex_offset = -1;
4382    cmd_buffer->state.last_first_instance = -1;
4383    cmd_buffer->state.last_drawid = -1;
4384    cmd_buffer->state.predication_type = -1;
4385    cmd_buffer->state.last_sx_ps_downconvert = -1;
4386    cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
4387    cmd_buffer->state.last_sx_blend_opt_control = -1;
4388    cmd_buffer->state.last_nggc_settings = -1;
4389    cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
4390    cmd_buffer->usage_flags = pBeginInfo->flags;
4391 
4392    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
4393        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
4394       assert(pBeginInfo->pInheritanceInfo);
4395       cmd_buffer->state.framebuffer =
4396          radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
4397       cmd_buffer->state.pass =
4398          radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
4399 
4400       struct radv_subpass *subpass =
4401          &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
4402 
4403       if (cmd_buffer->state.framebuffer) {
4404          result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL, NULL);
4405          if (result != VK_SUCCESS)
4406             return result;
4407       }
4408 
4409       cmd_buffer->state.inherited_pipeline_statistics =
4410          pBeginInfo->pInheritanceInfo->pipelineStatistics;
4411 
4412       radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
4413    }
4414 
4415    if (unlikely(cmd_buffer->device->trace_bo))
4416       radv_cmd_buffer_trace_emit(cmd_buffer);
4417 
4418    radv_describe_begin_cmd_buffer(cmd_buffer);
4419 
4420    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
4421 
4422    return result;
4423 }
4424 
4425 void
4426 radv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, uint32_t firstBinding,
4427                           uint32_t bindingCount, const VkBuffer *pBuffers,
4428                           const VkDeviceSize *pOffsets)
4429 {
4430    radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount, pBuffers, pOffsets,
4431                                  NULL, NULL);
4432 }
4433 
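/* pSizes and pStrides are optional; a NULL pStrides keeps the strides from the
 * pipeline or from vkCmdSetVertexInputEXT. On GFX6 and GFX10+ the bound offsets
 * and strides are also checked against the per-attribute alignment requirements
 * to track misaligned vertex bindings.
 */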
4434 void
4435 radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
4436                               uint32_t bindingCount, const VkBuffer *pBuffers,
4437                               const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
4438                               const VkDeviceSize *pStrides)
4439 {
4440    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4441    struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
4442    struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
4443    bool changed = false;
4444 
4445    /* We have to defer setting up the vertex buffers since we need the
4446     * buffer stride from the pipeline. */
4447 
4448    assert(firstBinding + bindingCount <= MAX_VBS);
4449    cmd_buffer->state.vbo_misaligned_mask = state->misaligned_mask;
4450    enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
4451    for (uint32_t i = 0; i < bindingCount; i++) {
4452       RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
4453       uint32_t idx = firstBinding + i;
4454       VkDeviceSize size = pSizes ? pSizes[i] : 0;
4455       VkDeviceSize stride = pStrides ? pStrides[i] : 0;
4456 
4457       /* pSizes and pStrides are optional. */
4458       if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] ||
4459                        vb[idx].size != size || (pStrides && vb[idx].stride != stride))) {
4460          changed = true;
4461       }
4462 
4463       vb[idx].buffer = buffer;
4464       vb[idx].offset = pOffsets[i];
4465       vb[idx].size = size;
4466       /* If pStrides is NULL, don't overwrite the strides specified by CmdSetVertexInputEXT. */
4467 
4468       if (chip == GFX6 || chip >= GFX10) {
4469          const uint32_t bit = 1u << idx;
4470          if (!buffer) {
4471             cmd_buffer->state.vbo_misaligned_mask &= ~bit;
4472             cmd_buffer->state.vbo_bound_mask &= ~bit;
4473          } else {
4474             cmd_buffer->state.vbo_bound_mask |= bit;
4475             if (pStrides && vb[idx].stride != stride) {
4476                if (stride & state->format_align_req_minus_1[idx])
4477                   cmd_buffer->state.vbo_misaligned_mask |= bit;
4478                else
4479                   cmd_buffer->state.vbo_misaligned_mask &= ~bit;
4480             }
4481             if (state->possibly_misaligned_mask & bit &&
4482                 (vb[idx].offset + state->offsets[idx]) & state->format_align_req_minus_1[idx])
4483                cmd_buffer->state.vbo_misaligned_mask |= bit;
4484          }
4485       }
4486 
4487       if (pStrides)
4488          vb[idx].stride = stride;
4489 
4490       if (buffer) {
4491          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo);
4492       }
4493    }
4494 
4495    if (!changed) {
4496       /* No state changes. */
4497       return;
4498    }
4499 
4500    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
4501                               RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
4502 }
4503 
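/* Translate a VkIndexType to the corresponding hardware VGT index type. */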
4504 static uint32_t
4505 vk_to_index_type(VkIndexType type)
4506 {
4507    switch (type) {
4508    case VK_INDEX_TYPE_UINT8_EXT:
4509       return V_028A7C_VGT_INDEX_8;
4510    case VK_INDEX_TYPE_UINT16:
4511       return V_028A7C_VGT_INDEX_16;
4512    case VK_INDEX_TYPE_UINT32:
4513       return V_028A7C_VGT_INDEX_32;
4514    default:
4515       unreachable("invalid index type");
4516    }
4517 }
4518 
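/* Size in bytes of one index for a given hardware VGT index type. */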
4519 static uint32_t
4520 radv_get_vgt_index_size(uint32_t type)
4521 {
4522    switch (type) {
4523    case V_028A7C_VGT_INDEX_8:
4524       return 1;
4525    case V_028A7C_VGT_INDEX_16:
4526       return 2;
4527    case V_028A7C_VGT_INDEX_32:
4528       return 4;
4529    default:
4530       unreachable("invalid index type");
4531    }
4532 }
4533 
4534 void
4535 radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
4536                         VkIndexType indexType)
4537 {
4538    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4539    RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
4540 
4541    if (cmd_buffer->state.index_buffer == index_buffer && cmd_buffer->state.index_offset == offset &&
4542        cmd_buffer->state.index_type == indexType) {
4543       /* No state changes. */
4544       return;
4545    }
4546 
4547    cmd_buffer->state.index_buffer = index_buffer;
4548    cmd_buffer->state.index_offset = offset;
4549    cmd_buffer->state.index_type = vk_to_index_type(indexType);
4550    cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
4551    cmd_buffer->state.index_va += index_buffer->offset + offset;
4552 
4553    int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
4554    cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
4555    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
4556    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
4557 }
4558 
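/* Record a descriptor set binding and make sure the set's backing BO (and,
 * without the global BO list, every buffer referenced by the set) is added to
 * the command stream.
 */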
4559 static void
4560 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
4561                          struct radv_descriptor_set *set, unsigned idx)
4562 {
4563    struct radeon_winsys *ws = cmd_buffer->device->ws;
4564 
4565    radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
4566 
4567    assert(set);
4568 
4569    if (!cmd_buffer->device->use_global_bo_list) {
4570       for (unsigned j = 0; j < set->header.buffer_count; ++j)
4571          if (set->descriptors[j])
4572             radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
4573    }
4574 
4575    if (set->header.bo)
4576       radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
4577 }
4578 
4579 void
4580 radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
4581                            VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
4582                            const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
4583                            const uint32_t *pDynamicOffsets)
4584 {
4585    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4586    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4587    unsigned dyn_idx = 0;
4588 
4589    const bool no_dynamic_bounds =
4590       cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
4591    struct radv_descriptor_state *descriptors_state =
4592       radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
4593 
4594    for (unsigned i = 0; i < descriptorSetCount; ++i) {
4595       unsigned set_idx = i + firstSet;
4596       RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
4597 
4598       /* If the set is already bound we only need to update the
4599        * (potentially changed) dynamic offsets. */
4600       if (descriptors_state->sets[set_idx] != set ||
4601           !(descriptors_state->valid & (1u << set_idx))) {
4602          radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
4603       }
4604 
4605       for (unsigned j = 0; j < layout->set[set_idx].dynamic_offset_count; ++j, ++dyn_idx) {
4606          unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
4607          uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
4608          assert(dyn_idx < dynamicOffsetCount);
4609 
4610          struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
4611 
4612          if (!range->va) {
4613             memset(dst, 0, 4 * 4);
4614          } else {
4615             uint64_t va = range->va + pDynamicOffsets[dyn_idx];
4616             dst[0] = va;
4617             dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
4618             dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
4619             dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4620                      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4621 
4622             if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
4623                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
4624                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
4625             } else {
4626                dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4627                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4628             }
4629          }
4630 
4631          cmd_buffer->push_constant_stages |= layout->set[set_idx].dynamic_offset_stages;
4632       }
4633    }
4634 }
4635 
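/* Lazily (re)allocate the host-side storage backing a push descriptor set,
 * growing it geometrically up to 96 * MAX_PUSH_DESCRIPTORS bytes. Returns false
 * (and records VK_ERROR_OUT_OF_HOST_MEMORY) on allocation failure.
 */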
4636 static bool
4637 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
4638                               struct radv_descriptor_set_layout *layout,
4639                               VkPipelineBindPoint bind_point)
4640 {
4641    struct radv_descriptor_state *descriptors_state =
4642       radv_get_descriptors_state(cmd_buffer, bind_point);
4643    set->header.size = layout->size;
4644    set->header.layout = layout;
4645 
4646    if (descriptors_state->push_set.capacity < set->header.size) {
4647       size_t new_size = MAX2(set->header.size, 1024);
4648       new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
4649       new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
4650 
4651       free(set->header.mapped_ptr);
4652       set->header.mapped_ptr = malloc(new_size);
4653 
4654       if (!set->header.mapped_ptr) {
4655          descriptors_state->push_set.capacity = 0;
4656          cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4657          return false;
4658       }
4659 
4660       descriptors_state->push_set.capacity = new_size;
4661    }
4662 
4663    return true;
4664 }
4665 
4666 void
4667 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
4668                               VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
4669                               uint32_t set, uint32_t descriptorWriteCount,
4670                               const VkWriteDescriptorSet *pDescriptorWrites)
4671 {
4672    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4673    struct radv_descriptor_set *push_set =
4674       (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
4675    unsigned bo_offset;
4676 
4677    assert(set == 0);
4678    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
4679 
4680    push_set->header.size = layout->set[set].layout->size;
4681    push_set->header.layout = layout->set[set].layout;
4682 
4683    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
4684                                      (void **)&push_set->header.mapped_ptr))
4685       return;
4686 
4687    push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4688    push_set->header.va += bo_offset;
4689 
4690    radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
4691                                radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
4692                                pDescriptorWrites, 0, NULL);
4693 
4694    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
4695 }
4696 
4697 void
4698 radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
4699                              VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
4700                              const VkWriteDescriptorSet *pDescriptorWrites)
4701 {
4702    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4703    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4704    struct radv_descriptor_state *descriptors_state =
4705       radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
4706    struct radv_descriptor_set *push_set =
4707       (struct radv_descriptor_set *)&descriptors_state->push_set.set;
4708 
4709    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
4710 
4711    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
4712                                       pipelineBindPoint))
4713       return;
4714 
4715    /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
4716     * because that is invalid, according to the Vulkan spec.
4717     */
4718    for (int i = 0; i < descriptorWriteCount; i++) {
4719       ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
4720       assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
4721    }
4722 
4723    radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
4724                                radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
4725                                pDescriptorWrites, 0, NULL);
4726 
4727    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
4728    descriptors_state->push_dirty = true;
4729 }
4730 
4731 void
4732 radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
4733                                          VkDescriptorUpdateTemplate descriptorUpdateTemplate,
4734                                          VkPipelineLayout _layout, uint32_t set, const void *pData)
4735 {
4736    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4737    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4738    RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
4739    struct radv_descriptor_state *descriptors_state =
4740       radv_get_descriptors_state(cmd_buffer, templ->bind_point);
4741    struct radv_descriptor_set *push_set =
4742       (struct radv_descriptor_set *)&descriptors_state->push_set.set;
4743 
4744    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
4745 
4746    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
4747                                       templ->bind_point))
4748       return;
4749 
4750    radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
4751                                             descriptorUpdateTemplate, pData);
4752 
4753    radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
4754    descriptors_state->push_dirty = true;
4755 }
4756 
4757 void
4758 radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
4759                       VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
4760                       const void *pValues)
4761 {
4762    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4763    memcpy(cmd_buffer->push_constants + offset, pValues, size);
4764    cmd_buffer->push_constant_stages |= stageFlags;
4765 }
4766 
4767 VkResult
4768 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
4769 {
4770    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4771 
4772    radv_emit_mip_change_flush_default(cmd_buffer);
4773 
4774    if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
4775       if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
4776          cmd_buffer->state.flush_bits |=
4777             RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
4778 
4779       /* Make sure to sync all pending active queries at the end of
4780        * the command buffer.
4781        */
4782       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
4783 
4784       /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
4785        * command buffer.
4786        */
4787       if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
4788          cmd_buffer->state.flush_bits |= radv_src_access_flush(
4789             cmd_buffer,
4790             VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
4791             NULL);
4792 
4793       /* Since NGG streamout uses GDS, we need to make GDS idle when
4794        * we leave the IB, otherwise another process might overwrite
4795        * it while our shaders are busy.
4796        */
4797       if (cmd_buffer->gds_needed)
4798          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4799 
4800       si_emit_cache_flush(cmd_buffer);
4801    }
4802 
4803    /* Make sure CP DMA is idle at the end of IBs because the kernel
4804     * doesn't wait for it.
4805     */
4806    si_cp_dma_wait_for_idle(cmd_buffer);
4807 
4808    radv_describe_end_cmd_buffer(cmd_buffer);
4809 
4810    vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
4811    vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
4812 
4813    VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
4814    if (result != VK_SUCCESS)
4815       return vk_error(cmd_buffer, result);
4816 
4817    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
4818 
4819    return cmd_buffer->record_result;
4820 }
4821 
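/* Emit the pre-built register state of a compute pipeline if it differs from
 * the one currently emitted, and track its scratch and BO requirements.
 */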
4822 static void
4823 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
4824 {
4825    if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
4826       return;
4827 
4828    assert(!pipeline->ctx_cs.cdw);
4829 
4830    cmd_buffer->state.emitted_compute_pipeline = pipeline;
4831 
4832    radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
4833    radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
4834 
4835    cmd_buffer->compute_scratch_size_per_wave_needed =
4836       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
4837    cmd_buffer->compute_scratch_waves_wanted =
4838       MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);
4839 
4840    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
4841                       pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
4842 
4843    if (unlikely(cmd_buffer->device->trace_bo))
4844       radv_save_pipeline(cmd_buffer, pipeline);
4845 }
4846 
4847 static void
4848 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
4849 {
4850    struct radv_descriptor_state *descriptors_state =
4851       radv_get_descriptors_state(cmd_buffer, bind_point);
4852 
4853    descriptors_state->dirty |= descriptors_state->valid;
4854 }
4855 
4856 void
4857 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
4858                      VkPipeline _pipeline)
4859 {
4860    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4861    RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
4862 
4863    switch (pipelineBindPoint) {
4864    case VK_PIPELINE_BIND_POINT_COMPUTE:
4865       if (cmd_buffer->state.compute_pipeline == pipeline)
4866          return;
4867       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
4868 
4869       cmd_buffer->state.compute_pipeline = pipeline;
4870       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
4871       break;
4872    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
4873       if (cmd_buffer->state.rt_pipeline == pipeline)
4874          return;
4875       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
4876 
4877       cmd_buffer->state.rt_pipeline = pipeline;
4878       cmd_buffer->push_constant_stages |=
4879          (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
4880           VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
4881           VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR);
4882       radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
4883       break;
4884    case VK_PIPELINE_BIND_POINT_GRAPHICS:
4885       if (cmd_buffer->state.pipeline == pipeline)
4886          return;
4887       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
4888 
4889       bool vtx_emit_count_changed =
4890          !pipeline || !cmd_buffer->state.pipeline ||
4891          cmd_buffer->state.pipeline->graphics.vtx_emit_num != pipeline->graphics.vtx_emit_num ||
4892          cmd_buffer->state.pipeline->graphics.vtx_base_sgpr != pipeline->graphics.vtx_base_sgpr;
4893       cmd_buffer->state.pipeline = pipeline;
4894       if (!pipeline)
4895          break;
4896 
4897       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
4898       cmd_buffer->push_constant_stages |= pipeline->active_stages;
4899 
4900       /* the new vertex shader might not have the same user regs */
4901       if (vtx_emit_count_changed) {
4902          cmd_buffer->state.last_first_instance = -1;
4903          cmd_buffer->state.last_vertex_offset = -1;
4904          cmd_buffer->state.last_drawid = -1;
4905       }
4906 
4907       /* Prefetch all pipeline shaders at first draw time. */
4908       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
4909 
4910       if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
4911           cmd_buffer->state.emitted_pipeline &&
4912           cmd_buffer->state.emitted_pipeline->graphics.is_ngg &&
4913           !cmd_buffer->state.pipeline->graphics.is_ngg) {
4914          /* Transitioning from NGG to legacy GS requires
4915           * VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH
4916           * is also emitted at the beginning of IBs when legacy
4917           * GS ring pointers are set.
4918           */
4919          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
4920       }
4921 
4922       radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
4923       radv_bind_streamout_state(cmd_buffer, pipeline);
4924 
4925       if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
4926          cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
4927       if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
4928          cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
4929 
4930       if (radv_pipeline_has_tess(pipeline))
4931          cmd_buffer->tess_rings_needed = true;
4932       break;
4933    default:
4934       assert(!"invalid bind point");
4935       break;
4936    }
4937 }
4938 
4939 void
4940 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
4941                     const VkViewport *pViewports)
4942 {
4943    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4944    struct radv_cmd_state *state = &cmd_buffer->state;
4945    ASSERTED const uint32_t total_count = firstViewport + viewportCount;
4946 
4947    assert(firstViewport < MAX_VIEWPORTS);
4948    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
4949 
4950    if (total_count <= state->dynamic.viewport.count &&
4951        !memcmp(state->dynamic.viewport.viewports + firstViewport, pViewports,
4952                viewportCount * sizeof(*pViewports))) {
4953       return;
4954    }
4955 
4956    if (state->dynamic.viewport.count < total_count)
4957       state->dynamic.viewport.count = total_count;
4958 
4959    memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
4960           viewportCount * sizeof(*pViewports));
4961    for (unsigned i = 0; i < viewportCount; i++) {
4962       radv_get_viewport_xform(&pViewports[i],
4963                               state->dynamic.viewport.xform[i + firstViewport].scale,
4964                               state->dynamic.viewport.xform[i + firstViewport].translate);
4965    }
4966 
4967    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
4968 }
4969 
4970 void
4971 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
4972                    const VkRect2D *pScissors)
4973 {
4974    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4975    struct radv_cmd_state *state = &cmd_buffer->state;
4976    ASSERTED const uint32_t total_count = firstScissor + scissorCount;
4977 
4978    assert(firstScissor < MAX_SCISSORS);
4979    assert(total_count >= 1 && total_count <= MAX_SCISSORS);
4980 
4981    if (total_count <= state->dynamic.scissor.count &&
4982        !memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
4983                scissorCount * sizeof(*pScissors))) {
4984       return;
4985    }
4986 
4987    if (state->dynamic.scissor.count < total_count)
4988       state->dynamic.scissor.count = total_count;
4989 
4990    memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
4991           scissorCount * sizeof(*pScissors));
4992 
4993    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
4994 }
4995 
4996 void
4997 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
4998 {
4999    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5000 
5001    if (cmd_buffer->state.dynamic.line_width == lineWidth)
5002       return;
5003 
5004    cmd_buffer->state.dynamic.line_width = lineWidth;
5005    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
5006 }
5007 
5008 void
5009 radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
5010                      float depthBiasClamp, float depthBiasSlopeFactor)
5011 {
5012    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5013    struct radv_cmd_state *state = &cmd_buffer->state;
5014 
5015    if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
5016        state->dynamic.depth_bias.clamp == depthBiasClamp &&
5017        state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
5018       return;
5019    }
5020 
5021    state->dynamic.depth_bias.bias = depthBiasConstantFactor;
5022    state->dynamic.depth_bias.clamp = depthBiasClamp;
5023    state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
5024 
5025    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
5026 }
5027 
5028 void
5029 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
5030 {
5031    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5032    struct radv_cmd_state *state = &cmd_buffer->state;
5033 
5034    if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
5035       return;
5036 
5037    memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
5038 
5039    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
5040 }
5041 
5042 void
5043 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
5044 {
5045    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5046    struct radv_cmd_state *state = &cmd_buffer->state;
5047 
5048    if (state->dynamic.depth_bounds.min == minDepthBounds &&
5049        state->dynamic.depth_bounds.max == maxDepthBounds) {
5050       return;
5051    }
5052 
5053    state->dynamic.depth_bounds.min = minDepthBounds;
5054    state->dynamic.depth_bounds.max = maxDepthBounds;
5055 
5056    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
5057 }
5058 
5059 void
5060 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5061                               uint32_t compareMask)
5062 {
5063    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5064    struct radv_cmd_state *state = &cmd_buffer->state;
5065    bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
5066    bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
5067 
5068    if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
5069        (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
5070       return;
5071    }
5072 
5073    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5074       state->dynamic.stencil_compare_mask.front = compareMask;
5075    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5076       state->dynamic.stencil_compare_mask.back = compareMask;
5077 
5078    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
5079 }
5080 
5081 void
5082 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5083                             uint32_t writeMask)
5084 {
5085    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5086    struct radv_cmd_state *state = &cmd_buffer->state;
5087    bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
5088    bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
5089 
5090    if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
5091        (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
5092       return;
5093    }
5094 
5095    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5096       state->dynamic.stencil_write_mask.front = writeMask;
5097    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5098       state->dynamic.stencil_write_mask.back = writeMask;
5099 
5100    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
5101 }
5102 
5103 void
5104 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5105                             uint32_t reference)
5106 {
5107    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5108    struct radv_cmd_state *state = &cmd_buffer->state;
5109    bool front_same = state->dynamic.stencil_reference.front == reference;
5110    bool back_same = state->dynamic.stencil_reference.back == reference;
5111 
5112    if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
5113        (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
5114       return;
5115    }
5116 
5117    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5118       cmd_buffer->state.dynamic.stencil_reference.front = reference;
5119    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5120       cmd_buffer->state.dynamic.stencil_reference.back = reference;
5121 
5122    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
5123 }
5124 
5125 void
5126 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
5127                                uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
5128 {
5129    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5130    struct radv_cmd_state *state = &cmd_buffer->state;
5131    ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
5132 
5133    assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
5134    assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
5135 
5136    if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
5137                pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
5138       return;
5139    }
5140 
5141    typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
5142                 pDiscardRectangles, discardRectangleCount);
5143 
5144    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
5145 }
5146 
5147 void
5148 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
5149                               const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
5150 {
5151    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5152    struct radv_cmd_state *state = &cmd_buffer->state;
5153 
5154    assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
5155 
5156    state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
5157    state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
5158    state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
5159    typed_memcpy(&state->dynamic.sample_location.locations[0],
5160                 pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);
5161 
5162    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
5163 }
5164 
5165 void
5166 radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
5167                           uint16_t lineStipplePattern)
5168 {
5169    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5170    struct radv_cmd_state *state = &cmd_buffer->state;
5171 
5172    if (state->dynamic.line_stipple.factor == lineStippleFactor &&
5173        state->dynamic.line_stipple.pattern == lineStipplePattern)
5174       return;
5175 
5176    state->dynamic.line_stipple.factor = lineStippleFactor;
5177    state->dynamic.line_stipple.pattern = lineStipplePattern;
5178 
5179    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5180 }
5181 
5182 void
5183 radv_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
5184 {
5185    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5186    struct radv_cmd_state *state = &cmd_buffer->state;
5187 
5188    if (state->dynamic.cull_mode == cullMode)
5189       return;
5190 
5191    state->dynamic.cull_mode = cullMode;
5192 
5193    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
5194 }
5195 
5196 void
5197 radv_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
5198 {
5199    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5200    struct radv_cmd_state *state = &cmd_buffer->state;
5201 
5202    if (state->dynamic.front_face == frontFace)
5203       return;
5204 
5205    state->dynamic.front_face = frontFace;
5206 
5207    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
5208 }
5209 
5210 void
5211 radv_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
5212                                 VkPrimitiveTopology primitiveTopology)
5213 {
5214    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5215    struct radv_cmd_state *state = &cmd_buffer->state;
5216    unsigned primitive_topology = si_translate_prim(primitiveTopology);
5217 
5218    if (state->dynamic.primitive_topology == primitive_topology)
5219       return;
5220 
5221    state->dynamic.primitive_topology = primitive_topology;
5222 
5223    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5224 }
5225 
5226 void
5227 radv_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, uint32_t viewportCount,
5228                                 const VkViewport *pViewports)
5229 {
5230    radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
5231 }
5232 
5233 void
5234 radv_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, uint32_t scissorCount,
5235                                const VkRect2D *pScissors)
5236 {
5237    radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
5238 }
5239 
5240 void
5241 radv_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
5242 
5243 {
5244    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5245    struct radv_cmd_state *state = &cmd_buffer->state;
5246 
5247    if (state->dynamic.depth_test_enable == depthTestEnable)
5248       return;
5249 
5250    state->dynamic.depth_test_enable = depthTestEnable;
5251 
5252    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
5253 }
5254 
5255 void
5256 radv_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
5257 {
5258    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5259    struct radv_cmd_state *state = &cmd_buffer->state;
5260 
5261    if (state->dynamic.depth_write_enable == depthWriteEnable)
5262       return;
5263 
5264    state->dynamic.depth_write_enable = depthWriteEnable;
5265 
5266    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
5267 }
5268 
5269 void
5270 radv_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
5271 {
5272    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5273    struct radv_cmd_state *state = &cmd_buffer->state;
5274 
5275    if (state->dynamic.depth_compare_op == depthCompareOp)
5276       return;
5277 
5278    state->dynamic.depth_compare_op = depthCompareOp;
5279 
5280    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
5281 }
5282 
5283 void
5284 radv_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
5285 {
5286    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5287    struct radv_cmd_state *state = &cmd_buffer->state;
5288 
5289    if (state->dynamic.depth_bounds_test_enable == depthBoundsTestEnable)
5290       return;
5291 
5292    state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
5293 
5294    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
5295 }
5296 
5297 void
5298 radv_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
5299 {
5300    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5301    struct radv_cmd_state *state = &cmd_buffer->state;
5302 
5303    if (state->dynamic.stencil_test_enable == stencilTestEnable)
5304       return;
5305 
5306    state->dynamic.stencil_test_enable = stencilTestEnable;
5307 
5308    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
5309 }
5310 
5311 void
5312 radv_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5313                         VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
5314                         VkCompareOp compareOp)
5315 {
5316    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5317    struct radv_cmd_state *state = &cmd_buffer->state;
5318    bool front_same = state->dynamic.stencil_op.front.fail_op == failOp &&
5319                      state->dynamic.stencil_op.front.pass_op == passOp &&
5320                      state->dynamic.stencil_op.front.depth_fail_op == depthFailOp &&
5321                      state->dynamic.stencil_op.front.compare_op == compareOp;
5322    bool back_same = state->dynamic.stencil_op.back.fail_op == failOp &&
5323                     state->dynamic.stencil_op.back.pass_op == passOp &&
5324                     state->dynamic.stencil_op.back.depth_fail_op == depthFailOp &&
5325                     state->dynamic.stencil_op.back.compare_op == compareOp;
5326 
5327    if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
5328        (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same))
5329       return;
5330 
5331    if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
5332       state->dynamic.stencil_op.front.fail_op = failOp;
5333       state->dynamic.stencil_op.front.pass_op = passOp;
5334       state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
5335       state->dynamic.stencil_op.front.compare_op = compareOp;
5336    }
5337 
5338    if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
5339       state->dynamic.stencil_op.back.fail_op = failOp;
5340       state->dynamic.stencil_op.back.pass_op = passOp;
5341       state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
5342       state->dynamic.stencil_op.back.compare_op = compareOp;
5343    }
5344 
5345    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
5346 }
5347 
5348 void
5349 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
5350                                   const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
5351 {
5352    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5353    struct radv_cmd_state *state = &cmd_buffer->state;
5354 
5355    if (state->dynamic.fragment_shading_rate.size.width == pFragmentSize->width &&
5356        state->dynamic.fragment_shading_rate.size.height == pFragmentSize->height &&
5357        state->dynamic.fragment_shading_rate.combiner_ops[0] == combinerOps[0] &&
5358        state->dynamic.fragment_shading_rate.combiner_ops[1] == combinerOps[1])
5359       return;
5360 
5361    state->dynamic.fragment_shading_rate.size = *pFragmentSize;
5362    for (unsigned i = 0; i < 2; i++)
5363       state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];
5364 
5365    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
5366 }
5367 
5368 void
5369 radv_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
5370 {
5371    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5372    struct radv_cmd_state *state = &cmd_buffer->state;
5373 
5374    if (state->dynamic.depth_bias_enable == depthBiasEnable)
5375       return;
5376 
5377    state->dynamic.depth_bias_enable = depthBiasEnable;
5378 
5379    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
5380 }
5381 
5382 void
5383 radv_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
5384 {
5385    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5386    struct radv_cmd_state *state = &cmd_buffer->state;
5387 
5388    if (state->dynamic.primitive_restart_enable == primitiveRestartEnable)
5389       return;
5390 
5391    state->dynamic.primitive_restart_enable = primitiveRestartEnable;
5392 
5393    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
5394 }
5395 
5396 void
5397 radv_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
5398                                       VkBool32 rasterizerDiscardEnable)
5399 {
5400    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5401    struct radv_cmd_state *state = &cmd_buffer->state;
5402 
5403    if (state->dynamic.rasterizer_discard_enable == rasterizerDiscardEnable)
5404       return;
5405 
5406    state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;
5407 
5408    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
5409 }
5410 
5411 void
5412 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
5413 {
5414    /* not implemented */
5415 }
5416 
5417 void
5418 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
5419 {
5420    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5421    struct radv_cmd_state *state = &cmd_buffer->state;
5422    unsigned logic_op = si_translate_blend_logic_op(logicOp);
5423 
5424    if (state->dynamic.logic_op == logic_op)
5425       return;
5426 
5427    state->dynamic.logic_op = logic_op;
5428 
5429    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
5430 }
5431 
5432 void
5433 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
5434                                const VkBool32 *pColorWriteEnables)
5435 {
5436    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5437    struct radv_cmd_state *state = &cmd_buffer->state;
5438    uint32_t color_write_enable = 0;
5439 
5440    assert(attachmentCount < MAX_RTS);
5441 
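   /* Pack the per-attachment enables into a 4-bit color write mask per bound color target. */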
5442    for (uint32_t i = 0; i < attachmentCount; i++) {
5443       color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
5444    }
5445 
5446    if (state->dynamic.color_write_enable == color_write_enable)
5447       return;
5448 
5449    state->dynamic.color_write_enable = color_write_enable;
5450 
5451    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
5452 }
5453 
5454 void
5455 radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
5456                           const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
5457                           uint32_t vertexAttributeDescriptionCount,
5458                           const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
5459 {
5460    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5461    struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
5462 
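   /* Index the binding descriptions by binding number so each attribute can look up its binding directly. */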
5463    const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
5464    for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
5465       bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
5466 
5467    cmd_buffer->state.vbo_misaligned_mask = 0;
5468 
5469    memset(state, 0, sizeof(*state));
5470 
5471    enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
5472    for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
5473       const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
5474       const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
5475       unsigned loc = attrib->location;
5476       const struct util_format_description *format_desc = vk_format_description(attrib->format);
5477       unsigned nfmt, dfmt;
5478       bool post_shuffle;
5479       enum radv_vs_input_alpha_adjust alpha_adjust;
5480 
5481       state->attribute_mask |= 1u << loc;
5482       state->bindings[loc] = attrib->binding;
5483       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
5484          state->instance_rate_inputs |= 1u << loc;
5485          state->divisors[loc] = binding->divisor;
5486          if (binding->divisor != 1)
5487             state->nontrivial_divisors |= 1u << loc;
5488       }
5489       cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
5490       state->offsets[loc] = attrib->offset;
5491 
5492       radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
5493                                    &dfmt, &nfmt, &post_shuffle, &alpha_adjust);
5494 
5495       state->formats[loc] = dfmt | (nfmt << 4);
5496       const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
5497                                                (format_desc->block.bits / 8u - 1);
5498       state->format_align_req_minus_1[loc] = format_align_req_minus_1;
5499       state->format_sizes[loc] = format_desc->block.bits / 8u;
5500 
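      /* On GFX6 and GFX10+, track attributes whose binding stride or bound offset violates the format's alignment requirement. */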
5501       if (chip == GFX6 || chip >= GFX10) {
5502          struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
5503          unsigned bit = 1u << loc;
5504          if (binding->stride & format_align_req_minus_1) {
5505             state->misaligned_mask |= bit;
5506             if (cmd_buffer->state.vbo_bound_mask & bit)
5507                cmd_buffer->state.vbo_misaligned_mask |= bit;
5508          } else {
5509             state->possibly_misaligned_mask |= bit;
5510             if (cmd_buffer->state.vbo_bound_mask & bit &&
5511                 ((vb[attrib->binding].offset + state->offsets[loc]) & format_align_req_minus_1))
5512                cmd_buffer->state.vbo_misaligned_mask |= bit;
5513          }
5514       }
5515 
5516       if (alpha_adjust) {
5517          state->alpha_adjust_lo |= (alpha_adjust & 0x1) << loc;
5518          state->alpha_adjust_hi |= (alpha_adjust >> 1) << loc;
5519       }
5520 
5521       if (post_shuffle)
5522          state->post_shuffle |= 1u << loc;
5523    }
5524 
5525    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
5526                               RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5527 }
5528 
5529 void
5530 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
5531                         const VkCommandBuffer *pCmdBuffers)
5532 {
5533    RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
5534 
5535    assert(commandBufferCount > 0);
5536 
5537    radv_emit_mip_change_flush_default(primary);
5538 
5539    /* Emit pending flushes on primary prior to executing secondary */
5540    si_emit_cache_flush(primary);
5541 
5542    /* Make sure CP DMA is idle on primary prior to executing secondary. */
5543    si_cp_dma_wait_for_idle(primary);
5544 
5545    for (uint32_t i = 0; i < commandBufferCount; i++) {
5546       RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
5547       bool allow_ib2 = true;
5548 
5549       if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
5550           secondary->state.uses_draw_indirect_multi) {
5551          /* Do not launch an IB2 for secondary command buffers that contain
5552           * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
5553           */
5554          allow_ib2 = false;
5555       }
5556 
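      /* Propagate the secondary's scratch, ring and other resource requirements to the primary. */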
5557       primary->scratch_size_per_wave_needed =
5558          MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
5559       primary->scratch_waves_wanted =
5560          MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
5561       primary->compute_scratch_size_per_wave_needed =
5562          MAX2(primary->compute_scratch_size_per_wave_needed,
5563               secondary->compute_scratch_size_per_wave_needed);
5564       primary->compute_scratch_waves_wanted =
5565          MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
5566 
5567       if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
5568          primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
5569       if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
5570          primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
5571       if (secondary->tess_rings_needed)
5572          primary->tess_rings_needed = true;
5573       if (secondary->sample_positions_needed)
5574          primary->sample_positions_needed = true;
5575       if (secondary->gds_needed)
5576          primary->gds_needed = true;
5577 
5578       if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
5579          /* Emit the framebuffer state from primary if secondary
5580           * has been recorded without a framebuffer, otherwise
5581           * fast color/depth clears can't work.
5582           */
5583          radv_emit_fb_mip_change_flush(primary);
5584          radv_emit_framebuffer_state(primary);
5585       }
5586 
5587       primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
5588 
5589       /* When the secondary command buffer is compute only we don't
5590        * need to re-emit the current graphics pipeline.
5591        */
5592       if (secondary->state.emitted_pipeline) {
5593          primary->state.emitted_pipeline = secondary->state.emitted_pipeline;
5594       }
5595 
5596       /* When the secondary command buffer is graphics only we don't
5597        * need to re-emit the current compute pipeline.
5598        */
5599       if (secondary->state.emitted_compute_pipeline) {
5600          primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
5601       }
5602 
5603       /* Only re-emit the draw packets when needed. */
5604       if (secondary->state.last_primitive_reset_en != -1) {
5605          primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
5606       }
5607 
5608       if (secondary->state.last_primitive_reset_index) {
5609          primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
5610       }
5611 
5612       if (secondary->state.last_ia_multi_vgt_param) {
5613          primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
5614       }
5615 
5616       primary->state.last_first_instance = secondary->state.last_first_instance;
5617       primary->state.last_num_instances = secondary->state.last_num_instances;
5618       primary->state.last_drawid = secondary->state.last_drawid;
5619       primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
5620       primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
5621       primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
5622       primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
5623 
5624       if (secondary->state.last_index_type != -1) {
5625          primary->state.last_index_type = secondary->state.last_index_type;
5626       }
5627 
5628       primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
5629       primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
5630       primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
5631    }
5632 
5633    /* After executing commands from secondary buffers we have to dirty
5634     * some states.
5635     */
5636    primary->state.dirty |=
5637       RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
5638    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
5639    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
5640 }
5641 
5642 VkResult
5643 radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
5644                        const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
5645 {
5646    RADV_FROM_HANDLE(radv_device, device, _device);
5647    struct radv_cmd_pool *pool;
5648 
5649    pool =
5650       vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5651    if (pool == NULL)
5652       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
5653 
5654    vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL);
5655 
5656    if (pAllocator)
5657       pool->alloc = *pAllocator;
5658    else
5659       pool->alloc = device->vk.alloc;
5660 
5661    list_inithead(&pool->cmd_buffers);
5662    list_inithead(&pool->free_cmd_buffers);
5663 
5664    pool->queue_family_index = pCreateInfo->queueFamilyIndex;
5665 
5666    *pCmdPool = radv_cmd_pool_to_handle(pool);
5667 
5668    return VK_SUCCESS;
5669 }
5670 
5671 void
5672 radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
5673                         const VkAllocationCallbacks *pAllocator)
5674 {
5675    RADV_FROM_HANDLE(radv_device, device, _device);
5676    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5677 
5678    if (!pool)
5679       return;
5680 
5681    list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
5682    {
5683       radv_destroy_cmd_buffer(cmd_buffer);
5684    }
5685 
5686    list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
5687    {
5688       radv_destroy_cmd_buffer(cmd_buffer);
5689    }
5690 
5691    vk_object_base_finish(&pool->base);
5692    vk_free2(&device->vk.alloc, pAllocator, pool);
5693 }
5694 
5695 VkResult
5696 radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
5697 {
5698    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5699    VkResult result;
5700 
5701    list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
5702    {
5703       result = radv_reset_cmd_buffer(cmd_buffer);
5704       if (result != VK_SUCCESS)
5705          return result;
5706    }
5707 
5708    return VK_SUCCESS;
5709 }
5710 
5711 void
5712 radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
5713 {
5714    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5715 
5716    if (!pool)
5717       return;
5718 
5719    list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
5720    {
5721       radv_destroy_cmd_buffer(cmd_buffer);
5722    }
5723 }
5724 
5725 static void
5726 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
5727 {
5728    struct radv_cmd_state *state = &cmd_buffer->state;
5729    struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
5730 
5731    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);
5732 
5733    radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);
5734 
5735    radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
5736 
5737    radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
5738 
5739    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5740       const uint32_t a = subpass->attachments[i].attachment;
5741       if (a == VK_ATTACHMENT_UNUSED)
5742          continue;
5743 
5744       radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
5745    }
5746 
5747    if (subpass->vrs_attachment) {
5748       int idx = subpass->vrs_attachment->attachment;
5749       struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;
5750 
5751       if (subpass->depth_stencil_attachment) {
5752          /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
5753           * copy the VRS rates to the HTILE buffer of the attachment.
5754           */
5755          int ds_idx = subpass->depth_stencil_attachment->attachment;
5756          struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
5757          struct radv_image *ds_image = ds_iview->image;
5758 
5759          VkExtent2D extent = {
5760             .width = ds_image->info.width,
5761             .height = ds_image->info.height,
5762          };
5763 
5764          /* HTILE buffer */
5765          uint64_t htile_offset = ds_image->offset + ds_image->planes[0].surface.meta_offset;
5766          uint64_t htile_size = ds_image->planes[0].surface.meta_slice_size;
5767          struct radv_buffer htile_buffer;
5768 
5769          radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bo, htile_size, htile_offset);
5770 
5771          /* Copy the VRS rates to the HTILE buffer. */
5772          radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);
5773 
5774          radv_buffer_finish(&htile_buffer);
5775       } else {
5776          /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
5777           * to copy the VRS rates to our internal HTILE buffer.
5778           */
5779          struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
5780          struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
5781 
5782          if (ds_image) {
5783             /* HTILE buffer */
5784             struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
5785 
5786             VkExtent2D extent = {
5787                .width = MIN2(fb->width, ds_image->info.width),
5788                .height = MIN2(fb->height, ds_image->info.height),
5789             };
5790 
5791             /* Copy the VRS rates to the HTILE buffer. */
5792             radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
5793          }
5794       }
5795    }
5796 
5797    radv_describe_barrier_end(cmd_buffer);
5798 
5799    radv_cmd_buffer_clear_subpass(cmd_buffer);
5800 
5801    assert(cmd_buffer->cs->cdw <= cdw_max);
5802 }
5803 
5804 static void
5805 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
5806 {
5807    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
5808 
5809    /* Have to be conservative in cmdbuffers with inherited attachments. */
5810    if (!cmd_buffer->state.attachments) {
5811       cmd_buffer->state.rb_noncoherent_dirty = true;
5812       return;
5813    }
5814 
5815    for (uint32_t i = 0; i < subpass->color_count; ++i) {
5816       const uint32_t a = subpass->color_attachments[i].attachment;
5817       if (a == VK_ATTACHMENT_UNUSED)
5818          continue;
5819       if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
5820          cmd_buffer->state.rb_noncoherent_dirty = true;
5821          return;
5822       }
5823    }
5824    if (subpass->depth_stencil_attachment &&
5825        !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
5826            .iview->image->l2_coherent)
5827       cmd_buffer->state.rb_noncoherent_dirty = true;
5828 }
5829 
5830 void
5831 radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
5832                                 const struct radv_subpass *subpass)
5833 {
5834    radv_mark_noncoherent_rb(cmd_buffer);
5835    radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
5836 }
5837 
5838 static void
5839 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
5840 {
5841    struct radv_cmd_state *state = &cmd_buffer->state;
5842    const struct radv_subpass *subpass = state->subpass;
5843    uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
5844 
5845    radv_cmd_buffer_resolve_subpass(cmd_buffer);
5846 
5847    radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
5848 
5849    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5850       const uint32_t a = subpass->attachments[i].attachment;
5851       if (a == VK_ATTACHMENT_UNUSED)
5852          continue;
5853 
5854       if (state->pass->attachments[a].last_subpass_idx != subpass_id)
5855          continue;
5856 
5857       VkImageLayout layout = state->pass->attachments[a].final_layout;
5858       VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
5859       struct radv_subpass_attachment att = {a, layout, stencil_layout};
5860       radv_handle_subpass_image_transition(cmd_buffer, att, false);
5861    }
5862 
5863    radv_describe_barrier_end(cmd_buffer);
5864 }
5865 
5866 void
5867 radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer,
5868                                   const VkRenderPassBeginInfo *pRenderPassBegin,
5869                                   const struct radv_extra_render_pass_begin_info *extra_info)
5870 {
5871    RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
5872    RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
5873    VkResult result;
5874 
5875    cmd_buffer->state.framebuffer = framebuffer;
5876    cmd_buffer->state.pass = pass;
5877    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
5878 
5879    result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin, extra_info);
5880    if (result != VK_SUCCESS)
5881       return;
5882 
5883    result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
5884    if (result != VK_SUCCESS)
5885       return;
5886 }
5887 
5888 void
5889 radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
5890                          const VkRenderPassBeginInfo *pRenderPassBeginInfo,
5891                          const VkSubpassBeginInfo *pSubpassBeginInfo)
5892 {
5893    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5894 
5895    radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBeginInfo, NULL);
5896 
5897    radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
5898 }
5899 
5900 void
5901 radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
5902                      const VkSubpassEndInfo *pSubpassEndInfo)
5903 {
5904    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5905 
5906    radv_mark_noncoherent_rb(cmd_buffer);
5907 
5908    uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
5909    radv_cmd_buffer_end_subpass(cmd_buffer);
5910    radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
5911 }
5912 
5913 static void
5914 radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
5915 {
5916    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
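   /* Write the view index into the AC_UD_VIEW_INDEX user SGPR of every active stage (including the GS copy shader). */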
5917    for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
5918       if (!radv_get_shader(pipeline, stage))
5919          continue;
5920 
5921       struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
5922       if (loc->sgpr_idx == -1)
5923          continue;
5924       uint32_t base_reg = pipeline->user_data_0[stage];
5925       radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5926    }
5927    if (radv_pipeline_has_gs_copy_shader(pipeline)) {
5928       struct radv_userdata_info *loc =
5929          &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
5930       if (loc->sgpr_idx != -1) {
5931          uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5932          radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5933       }
5934    }
5935 }
5936 
5937 static void
5938 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
5939                          uint32_t use_opaque)
5940 {
5941    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
5942    radeon_emit(cmd_buffer->cs, vertex_count);
5943    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
5944 }
5945 
5946 /**
5947  * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
5948  *
5949  * The starting address "index_va" may point anywhere within the index buffer. The number of
5950  * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
5951  * Hardware uses this information to return 0 for out-of-bounds reads.
5952  */
5953 static void
5954 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
5955                                  uint32_t max_index_count, uint32_t index_count, bool not_eop)
5956 {
5957    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
5958    radeon_emit(cmd_buffer->cs, max_index_count);
5959    radeon_emit(cmd_buffer->cs, index_va);
5960    radeon_emit(cmd_buffer->cs, index_va >> 32);
5961    radeon_emit(cmd_buffer->cs, index_count);
5962    /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
5963     * can be changed between draws and GS fast launch must be disabled.
5964     * NOT_EOP doesn't work on gfx9 and older.
5965     */
5966    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
5967 }
5968 
5969 /* MUST inline this function to avoid massive perf loss in drawoverhead */
5970 ALWAYS_INLINE static void
5971 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
5972                                   uint32_t draw_count, uint64_t count_va, uint32_t stride)
5973 {
5974    struct radeon_cmdbuf *cs = cmd_buffer->cs;
5975    const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
5976    bool draw_id_enable = cmd_buffer->state.pipeline->graphics.uses_drawid;
5977    uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
5978    uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
5979    bool predicating = cmd_buffer->state.predicating;
5980    assert(base_reg);
5981 
5982    /* just reset draw state for vertex data */
5983    cmd_buffer->state.last_first_instance = -1;
5984    cmd_buffer->state.last_num_instances = -1;
5985    cmd_buffer->state.last_drawid = -1;
5986    cmd_buffer->state.last_vertex_offset = -1;
5987 
5988    vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
5989    if (cmd_buffer->state.pipeline->graphics.uses_baseinstance)
5990       start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
5991    if (draw_id_enable)
5992       draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
5993 
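   /* A single draw with no count buffer and no draw id can use the simpler non-MULTI indirect packet. */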
5994    if (draw_count == 1 && !count_va && !draw_id_enable) {
5995       radeon_emit(cs,
5996                   PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
5997       radeon_emit(cs, 0);
5998       radeon_emit(cs, vertex_offset_reg);
5999       radeon_emit(cs, start_instance_reg);
6000       radeon_emit(cs, di_src_sel);
6001    } else {
6002       radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
6003                            predicating));
6004       radeon_emit(cs, 0);
6005       radeon_emit(cs, vertex_offset_reg);
6006       radeon_emit(cs, start_instance_reg);
6007       radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
6008                          S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
6009       radeon_emit(cs, draw_count); /* count */
6010       radeon_emit(cs, count_va);   /* count_addr */
6011       radeon_emit(cs, count_va >> 32);
6012       radeon_emit(cs, stride); /* stride */
6013       radeon_emit(cs, di_src_sel);
6014 
6015       cmd_buffer->state.uses_draw_indirect_multi = true;
6016    }
6017 }
6018 
6019 static inline void
6020 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
6021                                    const struct radv_draw_info *info, const uint32_t vertex_offset)
6022 {
6023    struct radv_cmd_state *state = &cmd_buffer->state;
6024    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6025    const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
6026    const bool uses_drawid = state->pipeline->graphics.uses_drawid;
6027    radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
6028                          state->pipeline->graphics.vtx_emit_num);
6029 
6030    radeon_emit(cs, vertex_offset);
6031    state->last_vertex_offset = vertex_offset;
6032    if (uses_drawid) {
6033       radeon_emit(cs, 0);
6034       state->last_drawid = 0;
6035    }
6036    if (uses_baseinstance) {
6037       radeon_emit(cs, info->first_instance);
6038       state->last_first_instance = info->first_instance;
6039    }
6040 }
6041 
6042 ALWAYS_INLINE static void
6043 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6044                           const uint32_t vertex_offset)
6045 {
6046    const struct radv_cmd_state *state = &cmd_buffer->state;
6047    const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
6048    const bool uses_drawid = state->pipeline->graphics.uses_drawid;
6049 
6050    /* this looks very dumb, but it allows the compiler to optimize better and yields
6051     * ~3-4% perf increase in drawoverhead
6052     */
6053    if (vertex_offset != state->last_vertex_offset) {
6054       radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6055    } else if (uses_drawid && 0 != state->last_drawid) {
6056       radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6057    } else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
6058       radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6059    }
6060 }
6061 
6062 ALWAYS_INLINE static void
6063 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
6064 {
6065    struct radv_cmd_state *state = &cmd_buffer->state;
6066    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6067    radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 1 + !!drawid);
6068    radeon_emit(cs, vertex_offset);
6069    state->last_vertex_offset = vertex_offset;
6070    if (drawid)
6071       radeon_emit(cs, drawid);
6072 
6073 }
6074 
6075 ALWAYS_INLINE static void
6076 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
6077                                const struct radv_draw_info *info,
6078                                uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
6079                                uint32_t stride,
6080                                const int32_t *vertexOffset)
6081 
6082 {
6083    struct radv_cmd_state *state = &cmd_buffer->state;
6084    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6085    const int index_size = radv_get_vgt_index_size(state->index_type);
6086    unsigned i = 0;
6087    const bool uses_drawid = state->pipeline->graphics.uses_drawid;
6088    const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10;
6089 
6090    if (uses_drawid) {
6091       if (vertexOffset) {
6092          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
6093          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6094             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6095 
6096             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6097             if (!remaining_indexes &&
6098                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6099                continue;
6100 
6101             if (i > 0)
6102                radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
6103 
6104             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6105 
6106             if (!state->subpass->view_mask) {
6107                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6108             } else {
6109                u_foreach_bit(view, state->subpass->view_mask) {
6110                   radv_emit_view_index(cmd_buffer, view);
6111 
6112                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6113                }
6114             }
6115          }
6116       } else {
6117          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6118             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6119 
6120             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6121             if (!remaining_indexes &&
6122                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6123                continue;
6124 
6125             if (i > 0) {
6126                if (state->last_vertex_offset != draw->vertexOffset)
6127                   radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
6128                else
6129                   radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
6130             } else
6131                radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
6132 
6133             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6134 
6135             if (!state->subpass->view_mask) {
6136                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6137             } else {
6138                u_foreach_bit(view, state->subpass->view_mask) {
6139                   radv_emit_view_index(cmd_buffer, view);
6140 
6141                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6142                }
6143             }
6144          }
6145       }
6146       if (drawCount > 1) {
6147          state->last_drawid = drawCount - 1;
6148       }
6149    } else {
6150       if (vertexOffset) {
6151          if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX10) {
6152             /* GFX10 has a bug where, in a chain of consecutive draw packets using NOT_EOP,
6153              * the final draw (the one without NOT_EOP) must not have count == 0.
6154              */
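             /* Example: with per-draw index counts {6, 0, 0}, drawCount is trimmed
              * from 3 to 1 below, so the final packet (the one without NOT_EOP)
              * keeps a nonzero count.
              */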
6155             while (drawCount > 1) {
6156                const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
6157                if (last->indexCount)
6158                   break;
6159                drawCount--;
6160             }
6161          }
6162 
6163          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
6164          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6165             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6166 
6167             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6168             if (!remaining_indexes &&
6169                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6170                continue;
6171 
6172             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6173 
6174             if (!state->subpass->view_mask) {
6175                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
6176             } else {
6177                u_foreach_bit(view, state->subpass->view_mask) {
6178                   radv_emit_view_index(cmd_buffer, view);
6179 
6180                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6181                }
6182             }
6183          }
6184       } else {
6185          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6186             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6187 
6188             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6189             if (!remaining_indexes &&
6190                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6191                continue;
6192 
6193             const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
6194             const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
6195             radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
6196 
6197             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6198 
6199             if (!state->subpass->view_mask) {
6200                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
6201             } else {
6202                u_foreach_bit(view, state->subpass->view_mask) {
6203                   radv_emit_view_index(cmd_buffer, view);
6204 
6205                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6206                }
6207             }
6208          }
6209       }
6210       if (drawCount > 1) {
6211          state->last_drawid = drawCount - 1;
6212       }
6213    }
6214 }
6215 
6216 ALWAYS_INLINE static void
6217 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6218                               uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
6219                               uint32_t use_opaque, uint32_t stride)
6220 {
6221    unsigned i = 0;
6222    const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
6223    const bool uses_drawid = cmd_buffer->state.pipeline->graphics.uses_drawid;
6224    uint32_t last_start = 0;
6225 
6226    vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
6227       if (!i)
6228          radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
6229       else
6230          radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
6231 
6232       if (!view_mask) {
6233          radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
6234       } else {
6235          u_foreach_bit(view, view_mask) {
6236             radv_emit_view_index(cmd_buffer, view);
6237             radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
6238          }
6239       }
6240       last_start = draw->firstVertex;
6241    }
6242    if (drawCount > 1) {
6243       struct radv_cmd_state *state = &cmd_buffer->state;
6244       state->last_vertex_offset = last_start;
6245       if (uses_drawid)
6246          state->last_drawid = drawCount - 1;
6247    }
6248 }
6249 
6250 static void
6251 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
6252                                 const struct radv_draw_info *info)
6253 {
6254    const struct radv_cmd_state *state = &cmd_buffer->state;
6255    struct radeon_winsys *ws = cmd_buffer->device->ws;
6256    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6257    const uint64_t va =
6258       radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
6259    const uint64_t count_va = info->count_buffer
6260                                 ? radv_buffer_get_va(info->count_buffer->bo) +
6261                                      info->count_buffer->offset + info->count_buffer_offset
6262                                 : 0;
6263 
6264    radv_cs_add_buffer(ws, cs, info->indirect->bo);
6265 
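   /* SET_BASE with base index 1 programs the address that the following indirect
    * draw packet(s) offset from (assumed meaning of the "1" emitted below).
    */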
6266    radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
6267    radeon_emit(cs, 1);
6268    radeon_emit(cs, va);
6269    radeon_emit(cs, va >> 32);
6270 
6271    if (info->count_buffer) {
6272       radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
6273    }
6274 
6275    if (!state->subpass->view_mask) {
6276       radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
6277                                         info->stride);
6278    } else {
6279       u_foreach_bit(i, state->subpass->view_mask)
6280       {
6281          radv_emit_view_index(cmd_buffer, i);
6282 
6283          radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
6284                                            info->stride);
6285       }
6286    }
6287 }
6288 
6289 /*
6290  * Vega and raven have a bug which triggers if there are multiple context
6291  * register contexts active at the same time with different scissor values.
6292  *
6293  * There are two possible workarounds:
6294  * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
6295  *    there is only ever 1 active set of scissor values at the same time.
6296  *
6297  * 2) Whenever the hardware switches contexts we have to set the scissor
6298  *    registers again even if it is a noop. That way the new context gets
6299  *    the correct scissor values.
6300  *
6301  * This implements option 2. radv_need_late_scissor_emission needs to
6302  * return true on affected HW if radv_emit_all_graphics_states sets
6303  * any context registers.
6304  */
6305 static bool
6306 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
6307                                 const struct radv_draw_info *info)
6308 {
6309    struct radv_cmd_state *state = &cmd_buffer->state;
6310 
6311    if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
6312       return false;
6313 
6314    if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
6315       return true;
6316 
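   /* Consider every non-dynamic dirty bit, plus only those dynamic states the
    * bound pipeline actually uses.
    */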
6317    uint64_t used_states =
6318       cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
6319 
6320    /* Index, vertex and streamout buffers don't change context regs, and
6321     * pipeline is already handled.
6322     */
6323    used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
6324                     RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
6325                     RADV_CMD_DIRTY_PIPELINE);
6326 
6327    if (cmd_buffer->state.dirty & used_states)
6328       return true;
6329 
6330    uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
6331 
6332    if (info->indexed && state->dynamic.primitive_restart_enable &&
6333        primitive_reset_index != state->last_primitive_reset_index)
6334       return true;
6335 
6336    return false;
6337 }
6338 
6339 enum {
6340    ngg_cull_none = 0,
6341    ngg_cull_front_face = 1,
6342    ngg_cull_back_face = 2,
6343    ngg_cull_face_is_ccw = 4,
6344    ngg_cull_small_primitives = 8,
6345 };
6346 
6347 ALWAYS_INLINE static bool
6348 radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
6349                       bool indirect)
6350 {
6351    /* If we have to draw only a few vertices, we get better latency if
6352     * we disable NGG culling.
6353     *
6354     * When tessellation is used, what matters is the number of tessellated
6355     * vertices, so let's always assume it's not a small draw.
6356     */
6357    return !has_tess && !indirect && vtx_cnt < 128;
6358 }
6359 
6360 ALWAYS_INLINE static uint32_t
6361 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
6362 {
6363    const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
6364    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
6365 
6366    /* Cull every triangle when rasterizer discard is enabled. */
6367    if (d->rasterizer_discard_enable ||
6368        G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl))
6369       return ngg_cull_front_face | ngg_cull_back_face;
6370 
6371    uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
6372    uint32_t nggc_settings = ngg_cull_none;
6373 
6374    /* The culling code needs to know whether face is CW or CCW. */
6375    bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
6376               ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
6377               : G_028814_FACE(pa_su_sc_mode_cntl) == 0;
6378 
6379    /* Take inverted viewport into account. */
6380    ccw ^= vp_y_inverted;
6381 
6382    if (ccw)
6383       nggc_settings |= ngg_cull_face_is_ccw;
6384 
6385    /* Face culling settings. */
6386    if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
6387          ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
6388          : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
6389       nggc_settings |= ngg_cull_front_face;
6390    if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
6391          ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
6392          : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
6393       nggc_settings |= ngg_cull_back_face;
6394 
6395    /* Small primitive culling is only valid when conservative overestimation is not used. */
6396    if (!pipeline->graphics.uses_conservative_overestimate) {
6397       nggc_settings |= ngg_cull_small_primitives;
6398 
6399       /* small_prim_precision = num_samples / 2^subpixel_bits
6400        * num_samples is also always a power of two, so the small prim precision can only be
6401        * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
6402        */
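      /* Worked example: with 4x MSAA and the 256 subpixel units below,
       * small_prim_precision = 4 / 256 = 2^-6, so small_prim_precision_log2 is
       * 2 - 8 = -6, and only that exponent lands in the top 8 bits of the settings.
       */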
6403       unsigned subpixel_bits = 256;
6404       int32_t small_prim_precision_log2 = util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits);
6405       nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
6406    }
6407 
6408    return nggc_settings;
6409 }
6410 
6411 static void
6412 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
6413 {
6414    struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
6415    const unsigned stage = pipeline->graphics.last_vgt_api_stage;
6416    const bool nggc_supported = pipeline->graphics.has_ngg_culling;
6417 
6418    if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
6419       /* Current shader doesn't support culling and culling was already disabled:
6420        * No further steps needed, just remember the SGPR's location is not set.
6421        */
6422       cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
6423       return;
6424    }
6425 
6426    /* Check dirty flags:
6427     * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
6428     * - Dirty dynamic flags: culling settings may have changed.
6429     */
6430    const bool dirty =
6431       cmd_buffer->state.dirty &
6432       (RADV_CMD_DIRTY_PIPELINE |
6433        RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
6434        RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
6435 
6436    /* Check small draw status:
6437     * For small draw calls, we disable culling by setting the SGPR to 0.
6438     */
6439    const bool skip =
6440       radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
6441 
6442    /* See if anything changed. */
6443    if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
6444       return;
6445 
6446    /* Remember small draw state. */
6447    cmd_buffer->state.last_nggc_skip = skip;
6448    const struct radv_shader_variant *v = pipeline->shaders[stage];
6449    assert(v->info.has_ngg_culling == nggc_supported);
6450 
6451    /* Find the user SGPR. */
6452    const uint32_t base_reg = pipeline->user_data_0[stage];
6453    const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
6454    assert(!nggc_supported || nggc_sgpr_idx != -1);
6455 
6456    /* Get viewport transform. */
6457    float vp_scale[2], vp_translate[2];
6458    memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
6459    memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
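   /* The viewport is Y-inverted exactly when its Y scale is negative, i.e. the
    * window-space Y at NDC y = -1 is greater than the Y at NDC y = +1, which is
    * what the comparison below checks.
    */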
6460    bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
6461 
6462    /* Get current culling settings. */
6463    uint32_t nggc_settings = nggc_supported && !skip
6464                             ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
6465                             : ngg_cull_none;
6466 
6467    bool emit_viewport = nggc_settings &&
6468                         (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
6469                          cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
6470                          !cmd_buffer->state.last_nggc_settings);
6471 
6472    if (emit_viewport) {
6473       /* Correction for inverted Y */
6474       if (vp_y_inverted) {
6475          vp_scale[1] = -vp_scale[1];
6476          vp_translate[1] = -vp_translate[1];
6477       }
6478 
6479       /* Correction for number of samples per pixel. */
6480       for (unsigned i = 0; i < 2; ++i) {
6481          vp_scale[i] *= (float) pipeline->graphics.ms.num_samples;
6482          vp_translate[i] *= (float) pipeline->graphics.ms.num_samples;
6483       }
6484 
6485       uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
6486       const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
6487       assert(vp_sgpr_idx != -1);
6488       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
6489       radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
6490    }
6491 
6492    bool emit_settings = nggc_supported &&
6493                         (cmd_buffer->state.last_nggc_settings != nggc_settings ||
6494                          cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
6495 
6496    /* This needs to be emitted when culling is turned on
6497     * and when it's already on but some settings change.
6498     */
6499    if (emit_settings) {
6500       assert(nggc_sgpr_idx >= 0);
6501       radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
6502    }
6503 
6504    /* These only need to be emitted when culling is turned on or off,
6505     * but not when it stays on and just some settings change.
6506     */
6507    if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
6508       uint32_t rsrc2 = v->config.rsrc2;
6509 
6510       if (!nggc_settings) {
6511          /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
6512          if (stage != MESA_SHADER_GEOMETRY)
6513             rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
6514       }
6515 
6516       /* When the pipeline is dirty and not yet emitted, don't write it here
6517        * because radv_emit_graphics_pipeline will overwrite this register.
6518        */
6519       if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
6520           cmd_buffer->state.emitted_pipeline == pipeline) {
6521          radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
6522       }
6523    }
6524 
6525    cmd_buffer->state.last_nggc_settings = nggc_settings;
6526    cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
6527 }
6528 
6529 static void
6530 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6531                               bool pipeline_is_dirty)
6532 {
6533    bool late_scissor_emission;
6534 
6535    if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
6536        cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
6537       radv_emit_rbplus_state(cmd_buffer);
6538 
6539    if (cmd_buffer->device->physical_device->use_ngg_culling &&
6540        cmd_buffer->state.pipeline->graphics.is_ngg)
6541       radv_emit_ngg_culling_state(cmd_buffer, info);
6542 
6543    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
6544       radv_emit_graphics_pipeline(cmd_buffer);
6545 
6546    /* This should be before the cmd_buffer->state.dirty is cleared
6547     * (excluding RADV_CMD_DIRTY_PIPELINE) and after
6548     * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
6549    late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);
6550 
6551    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
6552       radv_emit_framebuffer_state(cmd_buffer);
6553 
6554    if (info->indexed) {
6555       if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
6556          radv_emit_index_buffer(cmd_buffer, info->indirect);
6557    } else {
6558       /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
6559        * so the state must be re-emitted before the next indexed
6560        * draw.
6561        */
6562       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
6563          cmd_buffer->state.last_index_type = -1;
6564          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
6565       }
6566    }
6567 
6568    radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
6569 
6570    radv_emit_draw_registers(cmd_buffer, info);
6571 
6572    if (late_scissor_emission)
6573       radv_emit_scissor(cmd_buffer);
6574 }
6575 
6576 /* MUST inline this function to avoid massive perf loss in drawoverhead */
6577 ALWAYS_INLINE static bool
6578 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
6579 {
6580    const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
6581    const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
6582                                   cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
6583 
6584    ASSERTED const unsigned cdw_max =
6585       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
6586 
6587    if (likely(!info->indirect)) {
6588       /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
6589        * no workaround for indirect draws, but we can at least skip
6590        * direct draws.
6591        */
6592       if (unlikely(!info->instance_count))
6593          return false;
6594 
6595       /* Handle count == 0. */
6596       if (unlikely(!info->count && !info->strmout_buffer))
6597          return false;
6598    }
6599 
6600    /* Need to apply this workaround early as it can set flush flags. */
6601    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
6602       radv_emit_fb_mip_change_flush(cmd_buffer);
6603 
6604    /* Use optimal packet order based on whether we need to sync the
6605     * pipeline.
6606     */
6607    if (cmd_buffer->state.flush_bits &
6608        (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
6609         RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
6610       /* If we have to wait for idle, set all states first, so that
6611        * all SET packets are processed in parallel with previous draw
6612        * calls. Then upload descriptors, set shader pointers, and
6613        * draw, and prefetch at the end. This ensures that the time
6614        * the CUs are idle is very short. (there are only SET_SH
6615        * packets between the wait and the draw)
6616        */
6617       radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
6618       si_emit_cache_flush(cmd_buffer);
6619       /* <-- CUs are idle here --> */
6620 
6621       radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
6622    } else {
6623       /* If we don't wait for idle, start prefetches first, then set
6624        * states, and draw at the end.
6625        */
6626       si_emit_cache_flush(cmd_buffer);
6627 
6628       if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
6629          /* Only prefetch the vertex shader and VBO descriptors
6630           * in order to start the draw as soon as possible.
6631           */
6632          radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, true);
6633       }
6634 
6635       radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
6636 
6637       radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
6638    }
6639 
6640    radv_describe_draw(cmd_buffer);
6641    if (likely(!info->indirect)) {
6642       struct radv_cmd_state *state = &cmd_buffer->state;
6643       struct radeon_cmdbuf *cs = cmd_buffer->cs;
6644       assert(state->pipeline->graphics.vtx_base_sgpr);
6645       if (state->last_num_instances != info->instance_count) {
6646          radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
6647          radeon_emit(cs, info->instance_count);
6648          state->last_num_instances = info->instance_count;
6649       }
6650    }
6651    assert(cmd_buffer->cs->cdw <= cdw_max);
6652 
6653    return true;
6654 }
6655 
6656 static void
6657 radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
6658 {
6659    const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
6660    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
6661    /* Start prefetches after the draw has been started. Both will
6662     * run in parallel, but starting the draw first is more
6663     * important.
6664     */
6665    if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
6666       radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, false);
6667    }
6668 
6669    /* Workaround for a VGT hang when streamout is enabled.
6670     * It must be done after drawing.
6671     */
6672    if (cmd_buffer->state.streamout.streamout_enabled &&
6673        (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
6674         rad_info->family == CHIP_FIJI)) {
6675       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
6676    }
6677 
6678    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
6679 }
6680 
6681 void
6682 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
6683              uint32_t firstVertex, uint32_t firstInstance)
6684 {
6685    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6686    struct radv_draw_info info;
6687 
6688    info.count = vertexCount;
6689    info.instance_count = instanceCount;
6690    info.first_instance = firstInstance;
6691    info.strmout_buffer = NULL;
6692    info.indirect = NULL;
6693    info.indexed = false;
6694 
6695    if (!radv_before_draw(cmd_buffer, &info, 1))
6696       return;
6697    const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
6698    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
6699    radv_after_draw(cmd_buffer);
6700 }
6701 
6702 void
6703 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
6704                           uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
6705 {
6706    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6707    struct radv_draw_info info;
6708 
6709    if (!drawCount)
6710       return;
6711 
6712    info.count = pVertexInfo->vertexCount;
6713    info.instance_count = instanceCount;
6714    info.first_instance = firstInstance;
6715    info.strmout_buffer = NULL;
6716    info.indirect = NULL;
6717    info.indexed = false;
6718 
6719    if (!radv_before_draw(cmd_buffer, &info, drawCount))
6720       return;
6721    radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
6722    radv_after_draw(cmd_buffer);
6723 }
6724 
6725 void
6726 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
6727                     uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
6728 {
6729    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6730    struct radv_draw_info info;
6731 
6732    info.indexed = true;
6733    info.count = indexCount;
6734    info.instance_count = instanceCount;
6735    info.first_instance = firstInstance;
6736    info.strmout_buffer = NULL;
6737    info.indirect = NULL;
6738 
6739    if (!radv_before_draw(cmd_buffer, &info, 1))
6740       return;
6741    const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
6742    radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
6743    radv_after_draw(cmd_buffer);
6744 }
6745 
6746 void radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
6747                                   uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
6748 {
6749    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6750    struct radv_draw_info info;
6751 
6752    if (!drawCount)
6753       return;
6754 
6755    const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
6756    info.indexed = true;
6757    info.count = minfo->indexCount;
6758    info.instance_count = instanceCount;
6759    info.first_instance = firstInstance;
6760    info.strmout_buffer = NULL;
6761    info.indirect = NULL;
6762 
6763    if (!radv_before_draw(cmd_buffer, &info, drawCount))
6764       return;
6765    radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
6766    radv_after_draw(cmd_buffer);
6767 }
6768 
6769 void
6770 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
6771                      uint32_t drawCount, uint32_t stride)
6772 {
6773    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6774    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6775    struct radv_draw_info info;
6776 
6777    info.count = drawCount;
6778    info.indirect = buffer;
6779    info.indirect_offset = offset;
6780    info.stride = stride;
6781    info.strmout_buffer = NULL;
6782    info.count_buffer = NULL;
6783    info.indexed = false;
6784    info.instance_count = 0;
6785 
6786    if (!radv_before_draw(cmd_buffer, &info, 1))
6787       return;
6788    radv_emit_indirect_draw_packets(cmd_buffer, &info);
6789    radv_after_draw(cmd_buffer);
6790 }
6791 
6792 void
6793 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
6794                             uint32_t drawCount, uint32_t stride)
6795 {
6796    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6797    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6798    struct radv_draw_info info;
6799 
6800    info.indexed = true;
6801    info.count = drawCount;
6802    info.indirect = buffer;
6803    info.indirect_offset = offset;
6804    info.stride = stride;
6805    info.count_buffer = NULL;
6806    info.strmout_buffer = NULL;
6807    info.instance_count = 0;
6808 
6809    if (!radv_before_draw(cmd_buffer, &info, 1))
6810       return;
6811    radv_emit_indirect_draw_packets(cmd_buffer, &info);
6812    radv_after_draw(cmd_buffer);
6813 }
6814 
6815 void
6816 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
6817                           VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
6818                           uint32_t maxDrawCount, uint32_t stride)
6819 {
6820    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6821    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6822    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
6823    struct radv_draw_info info;
6824 
6825    info.count = maxDrawCount;
6826    info.indirect = buffer;
6827    info.indirect_offset = offset;
6828    info.count_buffer = count_buffer;
6829    info.count_buffer_offset = countBufferOffset;
6830    info.stride = stride;
6831    info.strmout_buffer = NULL;
6832    info.indexed = false;
6833    info.instance_count = 0;
6834 
6835    if (!radv_before_draw(cmd_buffer, &info, 1))
6836       return;
6837    radv_emit_indirect_draw_packets(cmd_buffer, &info);
6838    radv_after_draw(cmd_buffer);
6839 }
6840 
6841 void
6842 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
6843                                  VkDeviceSize offset, VkBuffer _countBuffer,
6844                                  VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
6845                                  uint32_t stride)
6846 {
6847    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6848    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
6849    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
6850    struct radv_draw_info info;
6851 
6852    info.indexed = true;
6853    info.count = maxDrawCount;
6854    info.indirect = buffer;
6855    info.indirect_offset = offset;
6856    info.count_buffer = count_buffer;
6857    info.count_buffer_offset = countBufferOffset;
6858    info.stride = stride;
6859    info.strmout_buffer = NULL;
6860    info.instance_count = 0;
6861 
6862    if (!radv_before_draw(cmd_buffer, &info, 1))
6863       return;
6864    radv_emit_indirect_draw_packets(cmd_buffer, &info);
6865    radv_after_draw(cmd_buffer);
6866 }
6867 
6868 struct radv_dispatch_info {
6869    /**
6870     * Determine the layout of the grid (in block units) to be used.
6871     */
6872    uint32_t blocks[3];
6873 
6874    /**
6875     * A starting offset for the grid. If unaligned is set, the offset
6876     * must still be aligned.
6877     */
6878    uint32_t offsets[3];
6879    /**
6880     * Whether it's an unaligned compute dispatch.
6881     */
6882    bool unaligned;
6883 
6884    /**
6885     * Indirect compute parameters resource.
6886     */
6887    struct radeon_winsys_bo *indirect;
6888    uint64_t va;
6889 };
6890 
6891 static void
6892 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
6893                            const struct radv_dispatch_info *info)
6894 {
6895    struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
6896    unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
6897    struct radeon_winsys *ws = cmd_buffer->device->ws;
6898    bool predicating = cmd_buffer->state.predicating;
6899    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6900    struct radv_userdata_info *loc;
6901 
6902    radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);
6903 
6904    loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
6905 
6906    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);
6907 
6908    if (compute_shader->info.wave_size == 32) {
6909       assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
6910       dispatch_initiator |= S_00B800_CS_W32_EN(1);
6911    }
6912 
6913    if (info->indirect) {
6914       radv_cs_add_buffer(ws, cs, info->indirect);
6915 
6916       if (loc->sgpr_idx != -1) {
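         /* Copy the three dwords of the indirect dispatch size from memory at
          * info->va into the grid-size user SGPRs, one COPY_DATA packet per
          * component.
          */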
6917          for (unsigned i = 0; i < 3; ++i) {
6918             radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6919             radeon_emit(cs,
6920                         COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG));
6921             radeon_emit(cs, (info->va + 4 * i));
6922             radeon_emit(cs, (info->va + 4 * i) >> 32);
6923             radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
6924             radeon_emit(cs, 0);
6925          }
6926       }
6927 
6928       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
6929          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1));
6930          radeon_emit(cs, info->va);
6931          radeon_emit(cs, info->va >> 32);
6932          radeon_emit(cs, dispatch_initiator);
6933       } else {
6934          radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
6935          radeon_emit(cs, 1);
6936          radeon_emit(cs, info->va);
6937          radeon_emit(cs, info->va >> 32);
6938 
6939          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
6940          radeon_emit(cs, 0);
6941          radeon_emit(cs, dispatch_initiator);
6942       }
6943    } else {
6944       unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
6945       unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
6946 
6947       if (info->unaligned) {
6948          unsigned *cs_block_size = compute_shader->info.cs.block_size;
6949          unsigned remainder[3];
6950 
6951          /* If aligned, these should be an entire block size,
6952           * not 0.
6953           */
6954          remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
6955          remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
6956          remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
6957 
6958          blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
6959          blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
6960          blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
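         /* Example (hypothetical sizes): blocks[0] = 70 threads with
          * cs_block_size[0] = 64 gives remainder[0] = 70 + 64 - 128 = 6 partial
          * threads, and blocks[0] is rounded up to cover the partial group too
          * (2 groups here, assuming round_up_u32 is a ceiling division). An exact
          * multiple would give remainder[0] = 64, i.e. a full block as noted above.
          */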
6961 
6962          for (unsigned i = 0; i < 3; ++i) {
6963             assert(offsets[i] % cs_block_size[i] == 0);
6964             offsets[i] /= cs_block_size[i];
6965          }
6966 
6967          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
6968          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
6969                             S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
6970          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
6971                             S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
6972          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
6973                             S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
6974 
6975          dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
6976       }
6977 
6978       if (loc->sgpr_idx != -1) {
6979          assert(loc->num_sgprs == 3);
6980 
6981          radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
6982          radeon_emit(cs, blocks[0]);
6983          radeon_emit(cs, blocks[1]);
6984          radeon_emit(cs, blocks[2]);
6985       }
6986 
6987       if (offsets[0] || offsets[1] || offsets[2]) {
6988          radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
6989          radeon_emit(cs, offsets[0]);
6990          radeon_emit(cs, offsets[1]);
6991          radeon_emit(cs, offsets[2]);
6992 
6993          /* The blocks in the packet are not counts but end values. */
6994          for (unsigned i = 0; i < 3; ++i)
6995             blocks[i] += offsets[i];
6996       } else {
6997          dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
6998       }
6999 
7000       radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
7001       radeon_emit(cs, blocks[0]);
7002       radeon_emit(cs, blocks[1]);
7003       radeon_emit(cs, blocks[2]);
7004       radeon_emit(cs, dispatch_initiator);
7005    }
7006 
7007    assert(cmd_buffer->cs->cdw <= cdw_max);
7008 }
7009 
7010 static void
7011 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
7012                                        struct radv_pipeline *pipeline,
7013                                        VkPipelineBindPoint bind_point)
7014 {
7015    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, pipeline, bind_point);
7016    radv_flush_constants(cmd_buffer,
7017                         bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
7018                            ? RADV_RT_STAGE_BITS
7019                            : VK_SHADER_STAGE_COMPUTE_BIT,
7020                         pipeline, bind_point);
7021 }
7022 
7023 static void
7024 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
7025               struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
7026 {
7027    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
7028    bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline;
7029    bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug &&
7030                            info->blocks[0] * info->blocks[1] * info->blocks[2] > 256;
7031 
7032    if (cs_regalloc_hang)
7033       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
7034                                       RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
7035 
7036    if (cmd_buffer->state.flush_bits &
7037        (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
7038         RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
7039       /* If we have to wait for idle, set all states first, so that
7040        * all SET packets are processed in parallel with previous draw
7041        * calls. Then upload descriptors, set shader pointers, and
7042        * dispatch, and prefetch at the end. This ensures that the
7043        * time the CUs are idle is very short. (there are only SET_SH
7044        * packets between the wait and the draw)
7045        */
7046       radv_emit_compute_pipeline(cmd_buffer, pipeline);
7047       si_emit_cache_flush(cmd_buffer);
7048       /* <-- CUs are idle here --> */
7049 
7050       radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
7051 
7052       radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
7053       /* <-- CUs are busy here --> */
7054 
7055       /* Start prefetches after the dispatch has been started. Both
7056        * will run in parallel, but starting the dispatch first is
7057        * more important.
7058        */
7059       if (has_prefetch && pipeline_is_dirty) {
7060          radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
7061       }
7062    } else {
7063       /* If we don't wait for idle, start prefetches first, then set
7064        * states, and dispatch at the end.
7065        */
7066       si_emit_cache_flush(cmd_buffer);
7067 
7068       if (has_prefetch && pipeline_is_dirty) {
7069          radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
7070       }
7071 
7072       radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
7073 
7074       radv_emit_compute_pipeline(cmd_buffer, pipeline);
7075       radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
7076    }
7077 
7078    if (pipeline_is_dirty) {
7079       /* Raytracing uses compute shaders but has separate bind points and pipelines.
7080        * So if we set compute userdata & shader registers we should dirty the raytracing
7081        * ones and the other way around.
7082        *
7083        * We only need to do this when the pipeline is dirty because when we switch between
7084        * the two we always need to switch pipelines.
7085        */
7086       radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
7087                                                      ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
7088                                                      : VK_PIPELINE_BIND_POINT_COMPUTE);
7089    }
7090 
7091    if (cs_regalloc_hang)
7092       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
7093 
7094    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
7095 }
7096 
7097 static void
7098 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
7099 {
7100    radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
7101                  VK_PIPELINE_BIND_POINT_COMPUTE);
7102 }
7103 
7104 void
7105 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
7106                      uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
7107 {
7108    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7109    struct radv_dispatch_info info = {0};
7110 
7111    info.blocks[0] = x;
7112    info.blocks[1] = y;
7113    info.blocks[2] = z;
7114 
7115    info.offsets[0] = base_x;
7116    info.offsets[1] = base_y;
7117    info.offsets[2] = base_z;
7118    radv_compute_dispatch(cmd_buffer, &info);
7119 }
7120 
7121 void
7122 radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
7123 {
7124    radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
7125 }
7126 
7127 void
7128 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
7129 {
7130    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7131    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
7132    struct radv_dispatch_info info = {0};
7133 
7134    info.indirect = buffer->bo;
7135    info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
7136 
7137    radv_compute_dispatch(cmd_buffer, &info);
7138 }
7139 
7140 void
7141 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
7142 {
7143    struct radv_dispatch_info info = {0};
7144 
7145    info.blocks[0] = x;
7146    info.blocks[1] = y;
7147    info.blocks[2] = z;
7148    info.unaligned = 1;
7149 
7150    radv_compute_dispatch(cmd_buffer, &info);
7151 }
7152 
7153 void
7154 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
7155 {
7156    struct radv_dispatch_info info = {0};
7157 
7158    info.indirect = bo;
7159    info.va = va;
7160 
7161    radv_compute_dispatch(cmd_buffer, &info);
7162 }
7163 
7164 static void
7165 radv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
7166 {
7167    radv_dispatch(cmd_buffer, info, cmd_buffer->state.rt_pipeline,
7168                  VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
7169 }
7170 
7171 static bool
7172 radv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer,
7173                     const VkStridedDeviceAddressRegionKHR *tables)
7174 {
7175    struct radv_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
7176    uint32_t base_reg;
7177    void *ptr;
7178    uint32_t *desc_ptr;
7179    uint32_t offset;
7180 
7181    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr))
7182       return false;
7183 
7184    desc_ptr = ptr;
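   /* Each of the 4 tables (raygen, miss, hit, callable -- see the caller) becomes
    * a 16-byte record: { VA low, VA high, stride, 0 }.
    */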
7185    for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) {
7186       desc_ptr[0] = tables[i].deviceAddress;
7187       desc_ptr[1] = tables[i].deviceAddress >> 32;
7188       desc_ptr[2] = tables[i].stride;
7189       desc_ptr[3] = 0;
7190    }
7191 
7192    uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
7193    struct radv_userdata_info *loc =
7194       radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
7195    if (loc->sgpr_idx == -1)
7196       return true;
7197 
7198    base_reg = pipeline->user_data_0[MESA_SHADER_COMPUTE];
7199    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
7200                             false);
7201    return true;
7202 }
7203 
7204 void
7205 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
7206                      const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
7207                      const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
7208                      const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
7209                      const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
7210                      uint32_t width, uint32_t height, uint32_t depth)
7211 {
7212    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7213    struct radv_dispatch_info info = {0};
7214 
7215    info.blocks[0] = width;
7216    info.blocks[1] = height;
7217    info.blocks[2] = depth;
7218    info.unaligned = 1;
7219 
7220    const VkStridedDeviceAddressRegionKHR tables[] = {
7221       *pRaygenShaderBindingTable,
7222       *pMissShaderBindingTable,
7223       *pHitShaderBindingTable,
7224       *pCallableShaderBindingTable,
7225    };
7226 
7227    if (!radv_rt_bind_tables(cmd_buffer, tables)) {
7228       return;
7229    }
7230 
7231    struct radv_userdata_info *loc = radv_lookup_user_sgpr(
7232       cmd_buffer->state.rt_pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE);
7233 
7234    if (loc->sgpr_idx != -1) {
7235       assert(loc->num_sgprs == 3);
7236 
7237       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
7238       radeon_emit(cmd_buffer->cs, width);
7239       radeon_emit(cmd_buffer->cs, height);
7240       radeon_emit(cmd_buffer->cs, depth);
7241    }
7242 
7243    radv_rt_dispatch(cmd_buffer, &info);
7244 }
7245 
7246 static void
7247 radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
7248 {
7249    unsigned wave_size = 0;
7250    unsigned scratch_bytes_per_wave = 0;
7251 
7252    if (cmd_buffer->state.rt_pipeline) {
7253       scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->scratch_bytes_per_wave;
7254       wave_size = cmd_buffer->state.rt_pipeline->shaders[MESA_SHADER_COMPUTE]->info.wave_size;
7255    }
7256 
7257    /* The hardware register is specified as a multiple of 256 DWORDS. */
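   /* e.g. (hypothetical values) a 100-byte stack per lane with wave_size = 64 adds
    * align(100 * 64, 1024) = 7168 bytes per wave, i.e. 7 * 256 dwords.
    */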
7258    scratch_bytes_per_wave += align(size * wave_size, 1024);
7259 
7260    cmd_buffer->compute_scratch_size_per_wave_needed =
7261       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
7262 }
7263 
7264 void
7265 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
7266 {
7267    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7268 
7269    radv_set_rt_stack_size(cmd_buffer, size);
7270    cmd_buffer->state.rt_stack_size = size;
7271 }
7272 
7273 void
7274 radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer)
7275 {
7276    vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
7277    vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
7278 
7279    cmd_buffer->state.pass = NULL;
7280    cmd_buffer->state.subpass = NULL;
7281    cmd_buffer->state.attachments = NULL;
7282    cmd_buffer->state.framebuffer = NULL;
7283    cmd_buffer->state.subpass_sample_locs = NULL;
7284 }
7285 
7286 void
7287 radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
7288 {
7289    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7290 
7291    radv_mark_noncoherent_rb(cmd_buffer);
7292 
7293    radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
7294 
7295    radv_cmd_buffer_end_subpass(cmd_buffer);
7296 
7297    radv_cmd_buffer_end_render_pass(cmd_buffer);
7298 }
7299 
7300 /*
7301  * For HTILE we have the following interesting clear words:
7302  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
7303  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
7304  *   0xfffffff0: Clear depth to 1.0
7305  *   0x00000000: Clear depth to 0.0
7306  */
7307 static void
7308 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7309                       const VkImageSubresourceRange *range)
7310 {
7311    struct radv_cmd_state *state = &cmd_buffer->state;
7312    uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
7313    VkClearDepthStencilValue value = {0};
7314    struct radv_barrier_data barrier = {0};
7315 
7316    barrier.layout_transitions.init_mask_ram = 1;
7317    radv_describe_layout_transition(cmd_buffer, &barrier);
7318 
7319    /* When transitioning from LAYOUT_UNDEFINED, not everyone is consistent
7320     * in considering previous rendering work for WAW hazards. */
7321    state->flush_bits |=
7322       radv_src_access_flush(cmd_buffer, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
7323 
7324    if (image->planes[0].surface.has_stencil &&
7325        !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
7326       /* Flush caches before performing a separate aspect initialization because it's a
7327        * read-modify-write operation.
7328        */
7329       state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image);
7330    }
7331 
7332    state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
7333 
7334    radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
7335 
7336    if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
7337       /* Initialize the TC-compat metadata value to 0 because by
7338        * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
7339        * have to conditionally update its value when performing
7340        * a fast depth clear.
7341        */
7342       radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
7343    }
7344 }
7345 
7346 static void
7347 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7348                                    VkImageLayout src_layout, bool src_render_loop,
7349                                    VkImageLayout dst_layout, bool dst_render_loop,
7350                                    unsigned src_queue_mask, unsigned dst_queue_mask,
7351                                    const VkImageSubresourceRange *range,
7352                                    struct radv_sample_locations_state *sample_locs)
7353 {
7354    struct radv_device *device = cmd_buffer->device;
7355 
7356    if (!radv_htile_enabled(image, range->baseMipLevel))
7357       return;
7358 
7359    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
7360       radv_initialize_htile(cmd_buffer, image, range);
7361    } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
7362                                                src_queue_mask) &&
7363               radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
7364                                               dst_queue_mask)) {
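      /* Uncompressed -> compressed: HTILE was not kept up to date while the
       * image was in an uncompressed layout, so it has to be re-initialized. */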
7365       radv_initialize_htile(cmd_buffer, image, range);
7366    } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
7367                                               src_queue_mask) &&
7368               !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
7369                                                dst_queue_mask)) {
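      /* Compressed -> uncompressed: flush DB metadata, then expand (decompress)
       * HTILE so the depth/stencil surface is valid without the metadata. */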
7370       cmd_buffer->state.flush_bits |=
7371          RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
7372 
7373       radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
7374 
7375       cmd_buffer->state.flush_bits |=
7376          RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
7377    }
7378 }
7379 
7380 static uint32_t
7381 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7382                 const VkImageSubresourceRange *range, uint32_t value)
7383 {
7384    struct radv_barrier_data barrier = {0};
7385 
7386    barrier.layout_transitions.init_mask_ram = 1;
7387    radv_describe_layout_transition(cmd_buffer, &barrier);
7388 
7389    return radv_clear_cmask(cmd_buffer, image, range, value);
7390 }
7391 
7392 uint32_t
7393 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7394                 const VkImageSubresourceRange *range)
7395 {
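   /* Default (fully expanded/identity) FMASK encodings, indexed by log2(samples)
    * for 1x, 2x, 4x and 8x MSAA respectively. */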
7396    static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
7397    uint32_t log2_samples = util_logbase2(image->info.samples);
7398    uint32_t value = fmask_clear_values[log2_samples];
7399    struct radv_barrier_data barrier = {0};
7400 
7401    barrier.layout_transitions.init_mask_ram = 1;
7402    radv_describe_layout_transition(cmd_buffer, &barrier);
7403 
7404    return radv_clear_fmask(cmd_buffer, image, range, value);
7405 }
7406 
7407 uint32_t
7408 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7409               const VkImageSubresourceRange *range, uint32_t value)
7410 {
7411    struct radv_barrier_data barrier = {0};
7412    uint32_t flush_bits = 0;
7413    unsigned size = 0;
7414 
7415    barrier.layout_transitions.init_mask_ram = 1;
7416    radv_describe_layout_transition(cmd_buffer, &barrier);
7417 
7418    flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
7419 
7420    if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
7421       /* When DCC is enabled with mipmaps, some levels might not
7422        * support fast clears and we have to initialize them as "fully
7423        * expanded".
7424        */
7425       /* Compute the size of all fast clearable DCC levels. */
7426       for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
7427          struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
7428          unsigned dcc_fast_clear_size =
7429             dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
7430 
7431          if (!dcc_fast_clear_size)
7432             break;
7433 
7434          size = dcc_level->dcc_offset + dcc_fast_clear_size;
7435       }
7436 
7437       /* Initialize the mipmap levels without DCC. */
7438       if (size != image->planes[0].surface.meta_size) {
7439          flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bo,
7440                                         image->offset + image->planes[0].surface.meta_offset + size,
7441                                         image->planes[0].surface.meta_size - size, 0xffffffff);
7442       }
7443    }
7444 
7445    return flush_bits;
7446 }
7447 
7448 /**
7449  * Initialize DCC/FMASK/CMASK metadata for a color image.
7450  */
7451 static void
7452 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7453                                VkImageLayout src_layout, bool src_render_loop,
7454                                VkImageLayout dst_layout, bool dst_render_loop,
7455                                unsigned src_queue_mask, unsigned dst_queue_mask,
7456                                const VkImageSubresourceRange *range)
7457 {
7458    uint32_t flush_bits = 0;
7459 
7460    /* When transitioning from LAYOUT_UNDEFINED, not everyone is
7461     * consistent in considering previous rendering work for WAW hazards.
7462     */
7463    cmd_buffer->state.flush_bits |=
7464       radv_src_access_flush(cmd_buffer, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, image);
7465 
7466    if (radv_image_has_cmask(image)) {
7467       uint32_t value;
7468 
7469       if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
7470          /* TODO: Fix clearing CMASK layers on GFX9. */
7471          if (radv_image_is_tc_compat_cmask(image) ||
7472              (radv_image_has_fmask(image) &&
7473               radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
7474                                          dst_render_loop, dst_queue_mask))) {
7475             value = 0xccccccccu;
7476          } else {
7477             value = 0xffffffffu;
7478          }
7479       } else {
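         /* Pre-GFX9: values that presumably mark CMASK as fully expanded (not
          * fast-cleared) for the given sample count, indexed by log2(samples). */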
7480          static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
7481          uint32_t log2_samples = util_logbase2(image->info.samples);
7482 
7483          value = cmask_clear_values[log2_samples];
7484       }
7485 
7486       flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
7487    }
7488 
7489    if (radv_image_has_fmask(image)) {
7490       flush_bits |= radv_init_fmask(cmd_buffer, image, range);
7491    }
7492 
7493    if (radv_dcc_enabled(image, range->baseMipLevel)) {
7494       uint32_t value = 0xffffffffu; /* Fully expanded mode. */
7495 
7496       if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
7497                                      dst_layout, dst_render_loop, dst_queue_mask)) {
7498          value = 0u;
7499       }
7500 
7501       flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
7502    }
7503 
7504    if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
7505       radv_update_fce_metadata(cmd_buffer, image, range, false);
7506 
7507       uint32_t color_values[2] = {0};
7508       radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
7509    }
7510 
7511    cmd_buffer->state.flush_bits |= flush_bits;
7512 }
7513 
7514 static void
7515 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7516                        VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
7517 {
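   /* Keep the displayable DCC surface in sync when the image is about to be
    * presented or handed over to a foreign queue. */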
7518    if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
7519        (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
7520         (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
7521       radv_retile_dcc(cmd_buffer, image);
7522 }
7523 
7524 static bool
7525 radv_image_need_retile(const struct radv_image *image)
7526 {
7527    return image->planes[0].surface.display_dcc_offset &&
7528           image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
7529 }
7530 
7531 /**
7532  * Handle color image transitions for DCC/FMASK/CMASK.
7533  */
7534 static void
7535 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7536                                    VkImageLayout src_layout, bool src_render_loop,
7537                                    VkImageLayout dst_layout, bool dst_render_loop,
7538                                    unsigned src_queue_mask, unsigned dst_queue_mask,
7539                                    const VkImageSubresourceRange *range)
7540 {
7541    bool dcc_decompressed = false, fast_clear_flushed = false;
7542 
7543    if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
7544        !radv_dcc_enabled(image, range->baseMipLevel))
7545       return;
7546 
7547    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
7548       radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
7549                                      dst_render_loop, src_queue_mask, dst_queue_mask, range);
7550 
7551       if (radv_image_need_retile(image))
7552          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
7553       return;
7554    }
7555 
7556    if (radv_dcc_enabled(image, range->baseMipLevel)) {
7557       if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
7558          cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
7559       } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
7560                                             src_layout, src_render_loop, src_queue_mask) &&
7561                  !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
7562                                              dst_layout, dst_render_loop, dst_queue_mask)) {
7563          radv_decompress_dcc(cmd_buffer, image, range);
7564          dcc_decompressed = true;
7565       } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7566                                             src_layout, src_render_loop, src_queue_mask) &&
7567                  !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7568                                              dst_layout, dst_render_loop, dst_queue_mask)) {
7569          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
7570          fast_clear_flushed = true;
7571       }
7572 
7573       if (radv_image_need_retile(image))
7574          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
7575    } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
7576       if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7577                                      src_layout, src_render_loop, src_queue_mask) &&
7578           !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
7579                                       dst_layout, dst_render_loop, dst_queue_mask)) {
7580          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
7581          fast_clear_flushed = true;
7582       }
7583    }
7584 
7585    /* MSAA color decompress. */
7586    if (radv_image_has_fmask(image) &&
7587        (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
7588        radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
7589        !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
7590       if (radv_dcc_enabled(image, range->baseMipLevel) &&
7591           !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
7592          /* A DCC decompress is required before expanding FMASK
7593           * when DCC stores aren't supported to avoid being in
7594           * a state where DCC is compressed and the main
7595           * surface is uncompressed.
7596           */
7597          radv_decompress_dcc(cmd_buffer, image, range);
7598       } else if (!fast_clear_flushed) {
7599          /* A FMASK decompress is required before expanding
7600           * FMASK.
7601           */
7602          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
7603       }
7604 
7605       struct radv_barrier_data barrier = {0};
7606       barrier.layout_transitions.fmask_color_expand = 1;
7607       radv_describe_layout_transition(cmd_buffer, &barrier);
7608 
7609       radv_expand_fmask_image_inplace(cmd_buffer, image, range);
7610    }
7611 }
7612 
7613 static void
7614 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7615                              VkImageLayout src_layout, bool src_render_loop,
7616                              VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family,
7617                              uint32_t dst_family, const VkImageSubresourceRange *range,
7618                              struct radv_sample_locations_state *sample_locs)
7619 {
7620    if (image->exclusive && src_family != dst_family) {
7621       /* This is an acquire or a release operation and there will be
7622        * a corresponding release/acquire. Do the transition in the
7623        * most flexible queue. */
7624 
7625       assert(src_family == cmd_buffer->queue_family_index ||
7626              dst_family == cmd_buffer->queue_family_index);
7627 
7628       if (src_family == VK_QUEUE_FAMILY_EXTERNAL || src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
7629          return;
7630 
7631       if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
7632          return;
7633 
7634       if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
7635           (src_family == RADV_QUEUE_GENERAL || dst_family == RADV_QUEUE_GENERAL))
7636          return;
7637    }
7638 
7639    unsigned src_queue_mask =
7640       radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index);
7641    unsigned dst_queue_mask =
7642       radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index);
7643 
7644    if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
7645       return;
7646 
7647    if (vk_format_has_depth(image->vk_format)) {
7648       radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
7649                                          dst_render_loop, src_queue_mask, dst_queue_mask, range,
7650                                          sample_locs);
7651    } else {
7652       radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
7653                                          dst_render_loop, src_queue_mask, dst_queue_mask, range);
7654    }
7655 }
7656 
7657 struct radv_barrier_info {
7658    enum rgp_barrier_reason reason;
7659    uint32_t eventCount;
7660    const VkEvent *pEvents;
7661    VkPipelineStageFlags srcStageMask;
7662    VkPipelineStageFlags dstStageMask;
7663 };
7664 
7665 static void
7666 radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
7667              const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount,
7668              const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount,
7669              const VkImageMemoryBarrier *pImageMemoryBarriers, const struct radv_barrier_info *info)
7670 {
7671    struct radeon_cmdbuf *cs = cmd_buffer->cs;
7672    enum radv_cmd_flush_bits src_flush_bits = 0;
7673    enum radv_cmd_flush_bits dst_flush_bits = 0;
7674 
7675    if (cmd_buffer->state.subpass)
7676       radv_mark_noncoherent_rb(cmd_buffer);
7677 
7678    radv_describe_barrier_start(cmd_buffer, info->reason);
7679 
7680    for (unsigned i = 0; i < info->eventCount; ++i) {
7681       RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
7682       uint64_t va = radv_buffer_get_va(event->bo);
7683 
7684       radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
7685 
7686       ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
7687 
7688       radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
7689       assert(cmd_buffer->cs->cdw <= cdw_max);
7690    }
7691 
7692    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
7693       src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask, NULL);
7694       dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, NULL);
7695    }
7696 
7697    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
7698       src_flush_bits |=
7699          radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask, NULL);
7700       dst_flush_bits |=
7701          radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, NULL);
7702    }
7703 
7704    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
7705       RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
7706 
7707       src_flush_bits |=
7708          radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask, image);
7709       dst_flush_bits |=
7710          radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, image);
7711    }
7712 
7713    /* The Vulkan spec 1.1.98 says:
7714     *
7715     * "An execution dependency with only
7716     *  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
7717     *  will only prevent that stage from executing in subsequently
7718     *  submitted commands. As this stage does not perform any actual
7719     *  execution, this is not observable - in effect, it does not delay
7720     *  processing of subsequent commands. Similarly an execution dependency
7721     *  with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
7722     *  will effectively not wait for any prior commands to complete."
7723     */
7724    if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
7725       radv_stage_flush(cmd_buffer, info->srcStageMask);
7726    cmd_buffer->state.flush_bits |= src_flush_bits;
7727 
7728    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
7729       RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
7730 
7731       const struct VkSampleLocationsInfoEXT *sample_locs_info =
7732          vk_find_struct_const(pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
7733       struct radv_sample_locations_state sample_locations = {0};
7734 
7735       if (sample_locs_info) {
7736          assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
7737          sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
7738          sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
7739          sample_locations.count = sample_locs_info->sampleLocationsCount;
7740          typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
7741                       sample_locs_info->sampleLocationsCount);
7742       }
7743 
7744       radv_handle_image_transition(
7745          cmd_buffer, image, pImageMemoryBarriers[i].oldLayout,
7746          false, /* Outside of a renderpass we are never in a renderloop */
7747          pImageMemoryBarriers[i].newLayout,
7748          false, /* Outside of a renderpass we are never in a renderloop */
7749          pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex,
7750          &pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
7751    }
7752 
7753    /* Make sure CP DMA is idle because the driver might have performed a
7754     * DMA operation for copying or filling buffers/images.
7755     */
7756    if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
7757       si_cp_dma_wait_for_idle(cmd_buffer);
7758 
7759    cmd_buffer->state.flush_bits |= dst_flush_bits;
7760 
7761    radv_describe_barrier_end(cmd_buffer);
7762 }
7763 
7764 void
7765 radv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
7766                         VkPipelineStageFlags destStageMask, VkBool32 byRegion,
7767                         uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
7768                         uint32_t bufferMemoryBarrierCount,
7769                         const VkBufferMemoryBarrier *pBufferMemoryBarriers,
7770                         uint32_t imageMemoryBarrierCount,
7771                         const VkImageMemoryBarrier *pImageMemoryBarriers)
7772 {
7773    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7774    struct radv_barrier_info info;
7775 
7776    info.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
7777    info.eventCount = 0;
7778    info.pEvents = NULL;
7779    info.srcStageMask = srcStageMask;
7780    info.dstStageMask = destStageMask;
7781 
7782    radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
7783                 pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
7784 }
7785 
7786 static void
7787 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
7788             VkPipelineStageFlags stageMask, unsigned value)
7789 {
7790    struct radeon_cmdbuf *cs = cmd_buffer->cs;
7791    uint64_t va = radv_buffer_get_va(event->bo);
7792 
7793    si_emit_cache_flush(cmd_buffer);
7794 
7795    radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
7796 
7797    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
7798 
7799    /* Flags that only require a top-of-pipe event. */
7800    VkPipelineStageFlags top_of_pipe_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
7801 
7802    /* Flags that only require a post-index-fetch event. */
7803    VkPipelineStageFlags post_index_fetch_flags =
7804       top_of_pipe_flags | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
7805 
7806    /* Flags that only require signaling post PS. */
7807    VkPipelineStageFlags post_ps_flags =
7808       post_index_fetch_flags | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
7809       VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
7810       VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
7811       VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT |
7812       VK_PIPELINE_STAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
7813       VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
7814 
7815    /* Flags that only require signaling post CS. */
7816    VkPipelineStageFlags post_cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
7817 
7818    /* Make sure CP DMA is idle because the driver might have performed a
7819     * DMA operation for copying or filling buffers/images.
7820     */
7821    if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
7822       si_cp_dma_wait_for_idle(cmd_buffer);
7823 
7824    if (!(stageMask & ~top_of_pipe_flags)) {
7825       /* Just need to sync the PFP engine. */
7826       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
7827       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
7828       radeon_emit(cs, va);
7829       radeon_emit(cs, va >> 32);
7830       radeon_emit(cs, value);
7831    } else if (!(stageMask & ~post_index_fetch_flags)) {
7832       /* Sync ME because PFP reads index and indirect buffers. */
7833       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
7834       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
7835       radeon_emit(cs, va);
7836       radeon_emit(cs, va >> 32);
7837       radeon_emit(cs, value);
7838    } else {
7839       unsigned event_type;
7840 
7841       if (!(stageMask & ~post_ps_flags)) {
7842          /* Sync previous fragment shaders. */
7843          event_type = V_028A90_PS_DONE;
7844       } else if (!(stageMask & ~post_cs_flags)) {
7845          /* Sync previous compute shaders. */
7846          event_type = V_028A90_CS_DONE;
7847       } else {
7848          /* Otherwise, sync all prior GPU work. */
7849          event_type = V_028A90_BOTTOM_OF_PIPE_TS;
7850       }
7851 
7852       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
7853                                  radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
7854                                  EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
7855                                  cmd_buffer->gfx9_eop_bug_va);
7856    }
7857 
7858    assert(cmd_buffer->cs->cdw <= cdw_max);
7859 }
7860 
7861 void
7862 radv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
7863 {
7864    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7865    RADV_FROM_HANDLE(radv_event, event, _event);
7866 
7867    write_event(cmd_buffer, event, stageMask, 1);
7868 }
7869 
7870 void
7871 radv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
7872 {
7873    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7874    RADV_FROM_HANDLE(radv_event, event, _event);
7875 
7876    write_event(cmd_buffer, event, stageMask, 0);
7877 }
7878 
7879 void
7880 radv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
7881                    VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
7882                    uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
7883                    uint32_t bufferMemoryBarrierCount,
7884                    const VkBufferMemoryBarrier *pBufferMemoryBarriers,
7885                    uint32_t imageMemoryBarrierCount,
7886                    const VkImageMemoryBarrier *pImageMemoryBarriers)
7887 {
7888    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7889    struct radv_barrier_info info;
7890 
7891    info.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS;
7892    info.eventCount = eventCount;
7893    info.pEvents = pEvents;
7894    info.srcStageMask = 0;
   info.dstStageMask = 0; /* radv_barrier() reads this field, so don't leave it uninitialized. */
7895 
7896    radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
7897                 pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
7898 }
7899 
7900 void
7901 radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
7902 {
7903    /* No-op */
7904 }
7905 
7906 /* VK_EXT_conditional_rendering */
7907 void
7908 radv_CmdBeginConditionalRenderingEXT(
7909    VkCommandBuffer commandBuffer,
7910    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
7911 {
7912    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7913    RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
7914    struct radeon_cmdbuf *cs = cmd_buffer->cs;
7915    unsigned pred_op = PREDICATION_OP_BOOL32;
7916    bool draw_visible = true;
7917    uint64_t va;
7918 
7919    va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
7920 
7921    /* By default, if the 32-bit value at offset in buffer memory is zero,
7922     * then the rendering commands are discarded, otherwise they are
7923     * executed as normal. If the inverted flag is set, all commands are
7924     * discarded if the value is non-zero.
7925     */
7926    if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
7927       draw_visible = false;
7928    }
7929 
7930    si_emit_cache_flush(cmd_buffer);
7931 
7932    if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL &&
7933        !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
7934       uint64_t pred_value = 0, pred_va;
7935       unsigned pred_offset;
7936 
7937       /* From the Vulkan spec 1.1.107:
7938        *
7939        * "If the 32-bit value at offset in buffer memory is zero,
7940        *  then the rendering commands are discarded, otherwise they
7941        *  are executed as normal. If the value of the predicate in
7942        *  buffer memory changes while conditional rendering is
7943        *  active, the rendering commands may be discarded in an
7944        *  implementation-dependent way. Some implementations may
7945        *  latch the value of the predicate upon beginning conditional
7946        *  rendering while others may read it before every rendering
7947        *  command."
7948        *
7949        * But, the AMD hardware treats the predicate as a 64-bit
7950        * value which means we need a workaround in the driver.
7951        * Luckily, we are not required to support the case where the value
7952        * changes while predication is active.
7953        *
7954        * The workaround is as follows:
7955        * 1) allocate a 64-bit value in the upload BO and initialize it
7956        *    to 0
7957        * 2) copy the 32-bit predicate value to the upload BO
7958        * 3) use the new allocated VA address for predication
7959        *
7960        * Based on the conditionalrender demo, it's faster to do the
7961        * COPY_DATA in ME (+ sync PFP) instead of PFP.
7962        */
7963       radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
7964 
7965       pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
7966 
7967       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
7968       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7969                          COPY_DATA_WR_CONFIRM);
7970       radeon_emit(cs, va);
7971       radeon_emit(cs, va >> 32);
7972       radeon_emit(cs, pred_va);
7973       radeon_emit(cs, pred_va >> 32);
7974 
7975       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
7976       radeon_emit(cs, 0);
7977 
7978       va = pred_va;
7979       pred_op = PREDICATION_OP_BOOL64;
7980    }
7981 
7982    /* Enable predication for this command buffer. */
7983    si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
7984    cmd_buffer->state.predicating = true;
7985 
7986    /* Store conditional rendering user info. */
7987    cmd_buffer->state.predication_type = draw_visible;
7988    cmd_buffer->state.predication_op = pred_op;
7989    cmd_buffer->state.predication_va = va;
7990 }
7991 
7992 void
7993 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
7994 {
7995    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7996 
7997    /* Disable predication for this command buffer. */
7998    si_emit_set_predication_state(cmd_buffer, false, 0, 0);
7999    cmd_buffer->state.predicating = false;
8000 
8001    /* Reset conditional rendering user info. */
8002    cmd_buffer->state.predication_type = -1;
8003    cmd_buffer->state.predication_op = 0;
8004    cmd_buffer->state.predication_va = 0;
8005 }
8006 
8007 /* VK_EXT_transform_feedback */
8008 void
8009 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
8010                                         uint32_t bindingCount, const VkBuffer *pBuffers,
8011                                         const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
8012 {
8013    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8014    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
8015    uint8_t enabled_mask = 0;
8016 
8017    assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
8018    for (uint32_t i = 0; i < bindingCount; i++) {
8019       uint32_t idx = firstBinding + i;
8020 
8021       sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
8022       sb[idx].offset = pOffsets[i];
8023 
8024       if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
8025          sb[idx].size = sb[idx].buffer->size - sb[idx].offset;
8026       } else {
8027          sb[idx].size = pSizes[i];
8028       }
8029 
8030       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
8031 
8032       enabled_mask |= 1 << idx;
8033    }
8034 
8035    cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
8036 
8037    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
8038 }
8039 
8040 static void
8041 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
8042 {
8043    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8044    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8045 
8046    radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
8047    radeon_emit(cs, S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | S_028B94_RAST_STREAM(0) |
8048                       S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
8049                       S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
8050                       S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
8051    radeon_emit(cs, so->hw_enabled_mask & so->enabled_stream_buffers_mask);
8052 
8053    cmd_buffer->state.context_roll_without_scissor_emitted = true;
8054 }
8055 
8056 static void
8057 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
8058 {
8059    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8060    bool old_streamout_enabled = so->streamout_enabled;
8061    uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
8062 
8063    so->streamout_enabled = enable;
8064 
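   /* Replicate the buffer-enable mask once per stream; the hardware expects
    * 4 enable bits for each of the 4 streams. */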
8065    so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
8066                          (so->enabled_mask << 12);
8067 
8068    if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
8069        ((old_streamout_enabled != so->streamout_enabled) ||
8070         (old_hw_enabled_mask != so->hw_enabled_mask)))
8071       radv_emit_streamout_enable(cmd_buffer);
8072 
8073    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
8074       cmd_buffer->gds_needed = true;
8075       cmd_buffer->gds_oa_needed = true;
8076    }
8077 }
8078 
8079 static void
8080 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
8081 {
8082    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8083    unsigned reg_strmout_cntl;
8084 
8085    /* The register is at different places on different ASICs. */
8086    if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
8087       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
8088       radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
8089    } else {
8090       reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
8091       radeon_set_config_reg(cs, reg_strmout_cntl, 0);
8092    }
8093 
8094    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
8095    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
8096 
8097    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
8098    radeon_emit(cs,
8099                WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
8100    radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
8101    radeon_emit(cs, 0);
8102    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
8103    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
8104    radeon_emit(cs, 4);                              /* poll interval */
8105 }
8106 
8107 static void
8108 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
8109                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8110                           const VkDeviceSize *pCounterBufferOffsets)
8111 
8112 {
8113    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
8114    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8115    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8116 
8117    radv_flush_vgt_streamout(cmd_buffer);
8118 
8119    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
8120    u_foreach_bit(i, so->enabled_mask)
8121    {
8122       int32_t counter_buffer_idx = i - firstCounterBuffer;
8123       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
8124          counter_buffer_idx = -1;
8125 
8126       /* AMD GCN binds streamout buffers as shader resources.
8127        * VGT only counts primitives and tells the shader through
8128        * SGPRs what to do.
8129        */
8130       radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
8131       radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
8132       radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
8133 
8134       cmd_buffer->state.context_roll_without_scissor_emitted = true;
8135 
8136       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
8137          /* The array of counter buffers is optional. */
8138          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
8139          uint64_t va = radv_buffer_get_va(buffer->bo);
8140          uint64_t counter_buffer_offset = 0;
8141 
8142          if (pCounterBufferOffsets)
8143             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
8144 
8145          va += buffer->offset + counter_buffer_offset;
8146 
8147          /* Append */
8148          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
8149          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
8150                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
8151          radeon_emit(cs, 0);                                                 /* unused */
8152          radeon_emit(cs, 0);                                                 /* unused */
8153          radeon_emit(cs, va);                                                /* src address lo */
8154          radeon_emit(cs, va >> 32);                                          /* src address hi */
8155 
8156          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
8157       } else {
8158          /* Start from the beginning. */
8159          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
8160          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
8161                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
8162          radeon_emit(cs, 0);                                                    /* unused */
8163          radeon_emit(cs, 0);                                                    /* unused */
8164          radeon_emit(cs, 0);                                                    /* unused */
8165          radeon_emit(cs, 0);                                                    /* unused */
8166       }
8167    }
8168 
8169    radv_set_streamout_enable(cmd_buffer, true);
8170 }
8171 
8172 static void
8173 gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
8174                            uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8175                            const VkDeviceSize *pCounterBufferOffsets)
8176 {
8177    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8178    unsigned last_target = util_last_bit(so->enabled_mask) - 1;
8179    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8180 
8181    assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
8182    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
8183 
8184    /* Sync because the next streamout operation will overwrite GDS and we
8185     * have to make sure it's idle.
8186     * TODO: Improve by tracking if there is a streamout operation in
8187     * flight.
8188     */
8189    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
8190    si_emit_cache_flush(cmd_buffer);
8191 
8192    u_foreach_bit(i, so->enabled_mask)
8193    {
8194       int32_t counter_buffer_idx = i - firstCounterBuffer;
8195       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
8196          counter_buffer_idx = -1;
8197 
8198       bool append =
8199          counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
8200       uint64_t va = 0;
8201 
8202       if (append) {
8203          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
8204          uint64_t counter_buffer_offset = 0;
8205 
8206          if (pCounterBufferOffsets)
8207             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
8208 
8209          va += radv_buffer_get_va(buffer->bo);
8210          va += buffer->offset + counter_buffer_offset;
8211 
8212          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
8213       }
8214 
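      /* Load the saved counter value (or 0 when starting from the beginning) into
       * GDS at dword offset i, where the NGG streamout path keeps the buffer-filled
       * sizes. */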
8215       radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
8216       radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
8217                          S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
8218       radeon_emit(cs, va);
8219       radeon_emit(cs, va >> 32);
8220       radeon_emit(cs, 4 * i); /* destination in GDS */
8221       radeon_emit(cs, 0);
8222       radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
8223    }
8224 
8225    radv_set_streamout_enable(cmd_buffer, true);
8226 }
8227 
8228 void
8229 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
8230                                   uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8231                                   const VkDeviceSize *pCounterBufferOffsets)
8232 {
8233    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8234 
8235    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
8236       gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
8237                                  pCounterBuffers, pCounterBufferOffsets);
8238    } else {
8239       radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
8240                                 pCounterBufferOffsets);
8241    }
8242 }
8243 
8244 static void
8245 radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
8246                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8247                         const VkDeviceSize *pCounterBufferOffsets)
8248 {
8249    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8250    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8251 
8252    radv_flush_vgt_streamout(cmd_buffer);
8253 
8254    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
8255    u_foreach_bit(i, so->enabled_mask)
8256    {
8257       int32_t counter_buffer_idx = i - firstCounterBuffer;
8258       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
8259          counter_buffer_idx = -1;
8260 
8261       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
8262          /* The array of counter buffers is optional. */
8263          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
8264          uint64_t va = radv_buffer_get_va(buffer->bo);
8265          uint64_t counter_buffer_offset = 0;
8266 
8267          if (pCounterBufferOffsets)
8268             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
8269 
8270          va += buffer->offset + counter_buffer_offset;
8271 
8272          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
8273          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
8274                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
8275                             STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
8276          radeon_emit(cs, va);                                  /* dst address lo */
8277          radeon_emit(cs, va >> 32);                            /* dst address hi */
8278          radeon_emit(cs, 0);                                   /* unused */
8279          radeon_emit(cs, 0);                                   /* unused */
8280 
8281          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
8282       }
8283 
8284       /* Deactivate transform feedback by zeroing the buffer size.
8285        * The counters (primitives generated, primitives emitted) may
8286        * be enabled even if there is no buffer bound. This ensures
8287        * that the primitives-emitted query won't increment.
8288        */
8289       radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
8290 
8291       cmd_buffer->state.context_roll_without_scissor_emitted = true;
8292    }
8293 
8294    radv_set_streamout_enable(cmd_buffer, false);
8295 }
8296 
8297 static void
8298 gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
8299                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8300                          const VkDeviceSize *pCounterBufferOffsets)
8301 {
8302    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
8303    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8304 
8305    assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
8306    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
8307 
8308    u_foreach_bit(i, so->enabled_mask)
8309    {
8310       int32_t counter_buffer_idx = i - firstCounterBuffer;
8311       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
8312          counter_buffer_idx = -1;
8313 
8314       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
8315          /* The array of counter buffers is optional. */
8316          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
8317          uint64_t va = radv_buffer_get_va(buffer->bo);
8318          uint64_t counter_buffer_offset = 0;
8319 
8320          if (pCounterBufferOffsets)
8321             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
8322 
8323          va += buffer->offset + counter_buffer_offset;
8324 
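         /* Once prior PS work is done, copy the GDS counter for buffer i back
          * into the counter buffer. */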
8325          si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
8326                                     radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
8327                                     EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);
8328 
8329          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
8330       }
8331    }
8332 
8333    radv_set_streamout_enable(cmd_buffer, false);
8334 }
8335 
8336 void
8337 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
8338                                 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
8339                                 const VkDeviceSize *pCounterBufferOffsets)
8340 {
8341    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8342 
8343    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
8344       gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
8345                                pCounterBufferOffsets);
8346    } else {
8347       radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
8348                               pCounterBufferOffsets);
8349    }
8350 }
8351 
8352 void
8353 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
8354                                  uint32_t firstInstance, VkBuffer _counterBuffer,
8355                                  VkDeviceSize counterBufferOffset, uint32_t counterOffset,
8356                                  uint32_t vertexStride)
8357 {
8358    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8359    RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
8360    struct radv_draw_info info;
8361 
8362    info.count = 0;
8363    info.instance_count = instanceCount;
8364    info.first_instance = firstInstance;
8365    info.strmout_buffer = counterBuffer;
8366    info.strmout_buffer_offset = counterBufferOffset;
8367    info.stride = vertexStride;
8368    info.indexed = false;
8369    info.indirect = NULL;
8370 
8371    if (!radv_before_draw(cmd_buffer, &info, 1))
8372       return;
8373    struct VkMultiDrawInfoEXT minfo = { 0, 0 };
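   /* USE_OPAQUE makes the hardware derive the vertex count from the streamout
    * buffer-filled size referenced by the counter buffer set up above. */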
8374    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
8375    radv_after_draw(cmd_buffer);
8376 }
8377 
8378 /* VK_AMD_buffer_marker */
8379 void
8380 radv_CmdWriteBufferMarkerAMD(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage,
8381                              VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
8382 {
8383    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8384    RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
8385    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8386    uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;
8387 
8388    si_emit_cache_flush(cmd_buffer);
8389 
8390    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
8391 
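   /* A top-of-pipe marker only needs an immediate CP write; any later stage
    * requires a bottom-of-pipe timestamp event. */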
8392    if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
8393       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
8394       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
8395                          COPY_DATA_WR_CONFIRM);
8396       radeon_emit(cs, marker);
8397       radeon_emit(cs, 0);
8398       radeon_emit(cs, va);
8399       radeon_emit(cs, va >> 32);
8400    } else {
8401       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
8402                                  radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
8403                                  0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
8404                                  cmd_buffer->gfx9_eop_bug_va);
8405    }
8406 
8407    assert(cmd_buffer->cs->cdw <= cdw_max);
8408 }
8409