1 /*
2  * Copyright © 2019 Raspberry Pi
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 #include "util/u_pack_color.h"
26 #include "vk_format_info.h"
27 #include "vk_util.h"
28 
29 const struct v3dv_dynamic_state default_dynamic_state = {
30    .viewport = {
31       .count = 0,
32    },
33    .scissor = {
34       .count = 0,
35    },
36    .stencil_compare_mask =
37    {
38      .front = ~0u,
39      .back = ~0u,
40    },
41    .stencil_write_mask =
42    {
43      .front = ~0u,
44      .back = ~0u,
45    },
46    .stencil_reference =
47    {
48      .front = 0u,
49      .back = 0u,
50    },
51    .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
52    .depth_bias = {
53       .constant_factor = 0.0f,
54       .depth_bias_clamp = 0.0f,
55       .slope_factor = 0.0f,
56    },
57    .line_width = 1.0f,
58    .color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1,
59 };
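
/* Illustrative sketch (not driver code): the default color_write_enable value
 * packs a 4-bit RGBA write mask per draw buffer into one bitfield. Assuming
 * V3D_MAX_DRAW_BUFFERS is 4, the expression above sets 16 bits, i.e. all
 * channels enabled on all draw buffers.
 */
#if 0
static void example_default_color_write_enable(void)
{
   const uint32_t max_draw_buffers = 4; /* assumed value of V3D_MAX_DRAW_BUFFERS */
   const uint64_t mask = (1ull << (4 * max_draw_buffers)) - 1;
   assert(mask == 0xffff); /* RGBA write bits for draw buffers 0..3 */
}
#endif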
60 
61 void
62 v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
63 {
64    if (!bo)
65       return;
66 
67    if (job->bo_handle_mask & bo->handle_bit) {
68       if (_mesa_set_search(job->bos, bo))
69          return;
70    }
71 
72    _mesa_set_add(job->bos, bo);
73    job->bo_count++;
74    job->bo_handle_mask |= bo->handle_bit;
75 }
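
/* Illustrative sketch (not driver code): bo_handle_mask acts as a cheap,
 * lossy filter in front of the job->bos set used above. Each BO contributes
 * a single bit derived from its handle; if that bit is clear, the BO cannot
 * already be tracked and the set lookup is skipped. The names below are
 * hypothetical stand-ins for the real v3dv fields.
 */
#if 0
static bool example_bo_maybe_tracked(uint64_t job_bo_handle_mask,
                                     uint32_t bo_handle)
{
   const uint64_t handle_bit = 1ull << (bo_handle % 64); /* assumed scheme */
   return (job_bo_handle_mask & handle_bit) != 0;
}
#endif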
76 
77 void
78 v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
79 {
80    assert(bo);
81    _mesa_set_add(job->bos, bo);
82    job->bo_count++;
83    job->bo_handle_mask |= bo->handle_bit;
84 }
85 
86 VKAPI_ATTR VkResult VKAPI_CALL
87 v3dv_CreateCommandPool(VkDevice _device,
88                        const VkCommandPoolCreateInfo *pCreateInfo,
89                        const VkAllocationCallbacks *pAllocator,
90                        VkCommandPool *pCmdPool)
91 {
92    V3DV_FROM_HANDLE(v3dv_device, device, _device);
93    struct v3dv_cmd_pool *pool;
94 
95    /* We only support one queue */
96    assert(pCreateInfo->queueFamilyIndex == 0);
97 
98    pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
99                            VK_OBJECT_TYPE_COMMAND_POOL);
100    if (pool == NULL)
101       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
102 
103    if (pAllocator)
104       pool->alloc = *pAllocator;
105    else
106       pool->alloc = device->vk.alloc;
107 
108    list_inithead(&pool->cmd_buffers);
109 
110    *pCmdPool = v3dv_cmd_pool_to_handle(pool);
111 
112    return VK_SUCCESS;
113 }
114 
115 static void
116 cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
117                 struct v3dv_device *device,
118                 struct v3dv_cmd_pool *pool,
119                 VkCommandBufferLevel level)
120 {
121    /* Do not reset the base object! If we are calling this from a command
122     * buffer reset, that would reset the loader's dispatch table for the
123     * command buffer, as well as any other relevant info from vk_object_base.
124     */
125    const uint32_t base_size = sizeof(struct vk_command_buffer);
126    uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
127    memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);
128 
129    cmd_buffer->device = device;
130    cmd_buffer->pool = pool;
131    cmd_buffer->level = level;
132 
133    list_inithead(&cmd_buffer->private_objs);
134    list_inithead(&cmd_buffer->jobs);
135    list_inithead(&cmd_buffer->list_link);
136 
137    assert(pool);
138    list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
139 
140    cmd_buffer->state.subpass_idx = -1;
141    cmd_buffer->state.meta.subpass_idx = -1;
142 
143    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
144 }
145 
146 static VkResult
147 cmd_buffer_create(struct v3dv_device *device,
148                   struct v3dv_cmd_pool *pool,
149                   VkCommandBufferLevel level,
150                   VkCommandBuffer *pCommandBuffer)
151 {
152    struct v3dv_cmd_buffer *cmd_buffer;
153    cmd_buffer = vk_zalloc2(&device->vk.alloc,
154                            &pool->alloc,
155                            sizeof(*cmd_buffer),
156                            8,
157                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
158    if (cmd_buffer == NULL)
159       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
160 
161    VkResult result;
162    result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
163    if (result != VK_SUCCESS) {
164       vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer);
165       return result;
166    }
167 
168    cmd_buffer_init(cmd_buffer, device, pool, level);
169 
170    *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
171 
172    return VK_SUCCESS;
173 }
174 
175 static void
176 job_destroy_gpu_cl_resources(struct v3dv_job *job)
177 {
178    assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
179           job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
180 
181    v3dv_cl_destroy(&job->bcl);
182    v3dv_cl_destroy(&job->rcl);
183    v3dv_cl_destroy(&job->indirect);
184 
185    /* Since we don't ref BOs when we add them to the command buffer, don't
186  * unref them here either. BOs will be freed when their corresponding API
187     * objects are destroyed.
188     */
189    _mesa_set_destroy(job->bos, NULL);
190 
191    v3dv_bo_free(job->device, job->tile_alloc);
192    v3dv_bo_free(job->device, job->tile_state);
193 }
194 
195 static void
196 job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
197 {
198    assert(job->type == V3DV_JOB_TYPE_GPU_CL);
199 
200    list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
201       list_del(&bo->list_link);
202       vk_free(&job->device->vk.alloc, bo);
203    }
204 
205    list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
206       list_del(&bo->list_link);
207       vk_free(&job->device->vk.alloc, bo);
208    }
209 
210    list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
211       list_del(&bo->list_link);
212       vk_free(&job->device->vk.alloc, bo);
213    }
214 }
215 
216 static void
217 job_destroy_gpu_csd_resources(struct v3dv_job *job)
218 {
219    assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
220    assert(job->cmd_buffer);
221 
222    v3dv_cl_destroy(&job->indirect);
223 
224    _mesa_set_destroy(job->bos, NULL);
225 
226    if (job->csd.shared_memory)
227       v3dv_bo_free(job->device, job->csd.shared_memory);
228 }
229 
230 static void
231 job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
232 {
233    assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
234    assert(job->cmd_buffer);
235    vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events);
236 }
237 
238 static void
239 job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
240 {
241    assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
242    assert(job->cmd_buffer);
243    v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
244 }
245 
246 void
247 v3dv_job_destroy(struct v3dv_job *job)
248 {
249    assert(job);
250 
251    list_del(&job->list_link);
252 
253    /* Cloned jobs don't make deep copies of the original jobs, so they don't
254     * own any of their resources. However, they do allocate clones of BO
255     * structs, so make sure we free those.
256     */
257    if (!job->is_clone) {
258       switch (job->type) {
259       case V3DV_JOB_TYPE_GPU_CL:
260       case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
261          job_destroy_gpu_cl_resources(job);
262          break;
263       case V3DV_JOB_TYPE_GPU_CSD:
264          job_destroy_gpu_csd_resources(job);
265          break;
266       case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
267          job_destroy_cpu_wait_events_resources(job);
268          break;
269       case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
270          job_destroy_cpu_csd_indirect_resources(job);
271          break;
272       default:
273          break;
274       }
275    } else {
276       /* Cloned jobs */
277       if (job->type == V3DV_JOB_TYPE_GPU_CL)
278          job_destroy_cloned_gpu_cl_resources(job);
279    }
280 
281    vk_free(&job->device->vk.alloc, job);
282 }
283 
284 void
285 v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
286                                 uint64_t obj,
287                                 v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
288 {
289    struct v3dv_cmd_buffer_private_obj *pobj =
290       vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
291                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
292    if (!pobj) {
293       v3dv_flag_oom(cmd_buffer, NULL);
294       return;
295    }
296 
297    pobj->obj = obj;
298    pobj->destroy_cb = destroy_cb;
299 
300    list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
301 }
302 
303 static void
304 cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
305                                struct v3dv_cmd_buffer_private_obj *pobj)
306 {
307    assert(pobj && pobj->obj && pobj->destroy_cb);
308    pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
309                     pobj->obj,
310                     &cmd_buffer->device->vk.alloc);
311    list_del(&pobj->list_link);
312    vk_free(&cmd_buffer->device->vk.alloc, pobj);
313 }
314 
315 static void
316 cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
317 {
318    list_for_each_entry_safe(struct v3dv_job, job,
319                             &cmd_buffer->jobs, list_link) {
320       v3dv_job_destroy(job);
321    }
322 
323    if (cmd_buffer->state.job)
324       v3dv_job_destroy(cmd_buffer->state.job);
325 
326    if (cmd_buffer->state.attachments)
327       vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
328 
329    if (cmd_buffer->state.query.end.alloc_count > 0)
330       vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);
331 
332    if (cmd_buffer->push_constants_resource.bo)
333       v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);
334 
335    list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
336                             &cmd_buffer->private_objs, list_link) {
337       cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
338    }
339 
340    if (cmd_buffer->state.meta.attachments) {
341       assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
342       vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
343    }
344 }
345 
346 static void
347 cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
348 {
349    list_del(&cmd_buffer->pool_link);
350    cmd_buffer_free_resources(cmd_buffer);
351    vk_command_buffer_finish(&cmd_buffer->vk);
352    vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc,
353             cmd_buffer);
354 }
355 
356 static bool
357 attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count,
358                           struct v3dv_subpass_attachment *l2, uint32_t l2_count)
359 {
360    for (uint32_t i = 0; i < l1_count; i++) {
361       uint32_t attachment_idx = l1[i].attachment;
362       if (attachment_idx == VK_ATTACHMENT_UNUSED)
363          continue;
364 
365       uint32_t j;
366       for (j = 0; j < l2_count; j++) {
367          if (l2[j].attachment == attachment_idx)
368             break;
369       }
370       if (j == l2_count)
371          return false;
372    }
373 
374    return true;
375 }
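
/* Illustrative example (not driver code): with l1 = {0, UNUSED, 2} and
 * l2 = {2, 0}, every used attachment in l1 also appears in l2, so the check
 * above returns true. cmd_buffer_can_merge_subpass below calls it in both
 * directions to verify the lists reference exactly the same attachments.
 */
#if 0
struct v3dv_subpass_attachment l1[] = {
   { .attachment = 0 }, { .attachment = VK_ATTACHMENT_UNUSED }, { .attachment = 2 },
};
struct v3dv_subpass_attachment l2[] = { { .attachment = 2 }, { .attachment = 0 } };
bool is_subset = attachment_list_is_subset(l1, 3, l2, 2); /* true */
#endif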
376 
377 static bool
378 cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
379                              uint32_t subpass_idx)
380 {
381    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
382    assert(state->pass);
383 
384    const struct v3dv_physical_device *physical_device =
385       &cmd_buffer->device->instance->physicalDevice;
386 
387    if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
388       return false;
389 
390    if (!cmd_buffer->state.job)
391       return false;
392 
393    if (cmd_buffer->state.job->always_flush)
394       return false;
395 
396    if (!physical_device->options.merge_jobs)
397       return false;
398 
399    /* Each render pass starts a new job */
400    if (subpass_idx == 0)
401       return false;
402 
403    /* Two subpasses can be merged in the same job if we can emit a single RCL
404     * for them (since the RCL includes the END_OF_RENDERING command that
405     * triggers the "render job finished" interrupt). We can do this so long
406     * as both subpasses render against the same attachments.
407     */
408    assert(state->subpass_idx == subpass_idx - 1);
409    struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
410    struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];
411 
412    /* Don't merge if the subpasses have different view masks, since in that
413     * case the framebuffer setup is different and we need to emit different
414     * RCLs.
415     */
416    if (subpass->view_mask != prev_subpass->view_mask)
417       return false;
418 
419    /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED,
420     * we need to check that for each subpass all its used attachments are
421     * used by the other subpass.
422     */
423    bool compatible =
424       attachment_list_is_subset(prev_subpass->color_attachments,
425                                 prev_subpass->color_count,
426                                 subpass->color_attachments,
427                                 subpass->color_count);
428    if (!compatible)
429       return false;
430 
431    compatible =
432       attachment_list_is_subset(subpass->color_attachments,
433                                 subpass->color_count,
434                                 prev_subpass->color_attachments,
435                                 prev_subpass->color_count);
436    if (!compatible)
437       return false;
438 
439    if (subpass->ds_attachment.attachment !=
440        prev_subpass->ds_attachment.attachment)
441       return false;
442 
443    /* FIXME: Since some attachment formats can't be resolved using the TLB we
444     * need to emit separate resolve jobs for them and that would not be
445     * compatible with subpass merges. We could fix that by testing if any of
446     * the attachments to resolve doesn't support TLB resolves.
447     */
448    if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
449       return false;
450 
451    return true;
452 }
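
/* Illustrative example (not driver code): two consecutive subpasses that
 * both render to color attachment 0 and depth attachment 1, with equal view
 * masks and no resolve attachments, pass every check above and share a
 * single job (and a single RCL). If the second subpass adds a new color
 * attachment, the attachment lists are no longer mutual subsets and a new
 * job is started instead.
 */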
453 
454 /**
455  * Computes and sets the job frame tiling information required to setup frame
456  * binning and rendering.
457  */
458 static struct v3dv_frame_tiling *
459 job_compute_frame_tiling(struct v3dv_job *job,
460                          uint32_t width,
461                          uint32_t height,
462                          uint32_t layers,
463                          uint32_t render_target_count,
464                          uint8_t max_internal_bpp,
465                          bool msaa)
466 {
467    static const uint8_t tile_sizes[] = {
468       64, 64,
469       64, 32,
470       32, 32,
471       32, 16,
472       16, 16,
473       16,  8,
474        8,  8
475    };
476 
477    assert(job);
478    struct v3dv_frame_tiling *tiling = &job->frame_tiling;
479 
480    tiling->width = width;
481    tiling->height = height;
482    tiling->layers = layers;
483    tiling->render_target_count = render_target_count;
484    tiling->msaa = msaa;
485 
486    uint32_t tile_size_index = 0;
487 
488    if (render_target_count > 2)
489       tile_size_index += 2;
490    else if (render_target_count > 1)
491       tile_size_index += 1;
492 
493    if (msaa)
494       tile_size_index += 2;
495 
496    tiling->internal_bpp = max_internal_bpp;
497    tile_size_index += tiling->internal_bpp;
498    assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2);
499 
500    tiling->tile_width = tile_sizes[tile_size_index * 2];
501    tiling->tile_height = tile_sizes[tile_size_index * 2 + 1];
502 
503    tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
504    tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);
505 
506    /* Size up our supertiles until we get under the limit */
507    const uint32_t max_supertiles = 256;
508    tiling->supertile_width = 1;
509    tiling->supertile_height = 1;
510    for (;;) {
511       tiling->frame_width_in_supertiles =
512          DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
513       tiling->frame_height_in_supertiles =
514          DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
515       const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
516                                       tiling->frame_height_in_supertiles;
517       if (num_supertiles < max_supertiles)
518          break;
519 
520       if (tiling->supertile_width < tiling->supertile_height)
521          tiling->supertile_width++;
522       else
523          tiling->supertile_height++;
524    }
525 
526    return tiling;
527 }
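
/* Worked example (illustrative, not driver code): assuming internal_bpp uses
 * 0 for 32bpp, 1 for 64bpp and 2 for 128bpp, a frame with one render target,
 * no MSAA and 32bpp lands on tile_size_index 0 and gets 64x64 tiles, so a
 * 1920x1080 frame uses 30x17 draw tiles. Four render targets plus MSAA move
 * the index to 4 and shrink the tile to 16x16.
 */
#if 0
uint32_t index = 0;
if (render_target_count > 2)      index += 2;
else if (render_target_count > 1) index += 1;
if (msaa)                         index += 2;
index += internal_bpp;            /* index 0 -> 64x64 ... index 6 -> 8x8 */
#endif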
528 
529 void
530 v3dv_job_start_frame(struct v3dv_job *job,
531                      uint32_t width,
532                      uint32_t height,
533                      uint32_t layers,
534                      bool allocate_tile_state_for_all_layers,
535                      uint32_t render_target_count,
536                      uint8_t max_internal_bpp,
537                      bool msaa)
538 {
539    assert(job);
540 
541    /* Start by computing frame tiling spec for this job */
542    const struct v3dv_frame_tiling *tiling =
543       job_compute_frame_tiling(job,
544                                width, height, layers,
545                                render_target_count, max_internal_bpp, msaa);
546 
547    v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
548    v3dv_return_if_oom(NULL, job);
549 
550    /* We only need to allocate tile state for all layers if the binner
551     * writes primitives to layers other than the first. This can only be
552     * done using layered rendering (writing gl_Layer from a geometry shader),
553     * so for other cases of multilayered framebuffers (typically with
554     * meta copy/clear operations) that won't use layered rendering, we only
555     * need one layer worth of tile state for the binner.
556     */
557    if (!allocate_tile_state_for_all_layers)
558       layers = 1;
559 
560    /* The PTB will request the tile alloc initial size per tile at start
561     * of tile binning.
562     */
563    uint32_t tile_alloc_size = 64 * tiling->layers *
564                               tiling->draw_tiles_x *
565                               tiling->draw_tiles_y;
566 
567    /* The PTB allocates in aligned 4k chunks after the initial setup. */
568    tile_alloc_size = align(tile_alloc_size, 4096);
569 
570    /* Include the first two chunk allocations that the PTB does so that
571     * we definitely clear the OOM condition before triggering one (the HW
572     * won't trigger OOM during the first allocations).
573     */
574    tile_alloc_size += 8192;
575 
576    /* For performance, allocate some extra initial memory after the PTB's
577     * minimal allocations, so that we hopefully don't have to block the
578     * GPU on the kernel handling an OOM signal.
579     */
580    tile_alloc_size += 512 * 1024;
581 
582    job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
583                                    "tile_alloc", true);
584    if (!job->tile_alloc) {
585       v3dv_flag_oom(NULL, job);
586       return;
587    }
588 
589    v3dv_job_add_bo_unchecked(job, job->tile_alloc);
590 
591    const uint32_t tsda_per_tile_size = 256;
592    const uint32_t tile_state_size = tiling->layers *
593                                     tiling->draw_tiles_x *
594                                     tiling->draw_tiles_y *
595                                     tsda_per_tile_size;
596    job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
597    if (!job->tile_state) {
598       v3dv_flag_oom(NULL, job);
599       return;
600    }
601 
602    v3dv_job_add_bo_unchecked(job, job->tile_state);
603 
604    v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers);
605 
606    job->ez_state = V3D_EZ_UNDECIDED;
607    job->first_ez_state = V3D_EZ_UNDECIDED;
608 }
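
/* Worked example (illustrative, not driver code): for a single-layer
 * 1920x1080 frame with 64x64 tiles (30x17 draw tiles), the initial tile
 * allocation is 64 * 1 * 30 * 17 = 32640 bytes, aligned up to 32768 (4k
 * chunks), plus 8192 for the first two PTB chunks and 512KiB of extra
 * headroom, i.e. 565248 bytes in total. The TSDA BO is 256 bytes per tile
 * per layer: 30 * 17 * 256 = 130560 bytes.
 */
#if 0
uint32_t tiles_x = DIV_ROUND_UP(1920, 64);                  /* 30 */
uint32_t tiles_y = DIV_ROUND_UP(1080, 64);                  /* 17 */
uint32_t alloc   = align(64 * 1 * tiles_x * tiles_y, 4096); /* 32768 */
alloc += 8192 + 512 * 1024;                                 /* 565248 */
#endif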
609 
610 static void
611 cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
612 {
613    assert(cmd_buffer->state.job);
614 
615    /* Typically, we have a single job for each subpass and we emit the job's RCL
616     * here when we are ending the frame for the subpass. However, some commands
617     * such as vkCmdClearAttachments need to run in their own separate job and
618     * they emit their own RCL even if they execute inside a subpass. In this
619     * scenario, we don't want to emit the subpass RCL when we end the frame for
620     * those jobs, so we only emit the subpass RCL if the job has not recorded
621     * any RCL commands of its own.
622     */
623    if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
624       v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);
625 
626    v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job);
627 }
628 
629 struct v3dv_job *
630 v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
631                                enum v3dv_job_type type,
632                                struct v3dv_cmd_buffer *cmd_buffer,
633                                uint32_t subpass_idx)
634 {
635    struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
636                                     sizeof(struct v3dv_job), 8,
637                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
638    if (!job) {
639       v3dv_flag_oom(cmd_buffer, NULL);
640       return NULL;
641    }
642 
643    v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
644    return job;
645 }
646 
647 static void
648 cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
649 {
650    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
651 
652    if (state->query.end.used_count > 0) {
653       const uint32_t query_count = state->query.end.used_count;
654       for (uint32_t i = 0; i < query_count; i++) {
655          assert(i < state->query.end.used_count);
656          struct v3dv_job *job =
657             v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
658                                            V3DV_JOB_TYPE_CPU_END_QUERY,
659                                            cmd_buffer, -1);
660          v3dv_return_if_oom(cmd_buffer, NULL);
661 
662          job->cpu.query_end = state->query.end.states[i];
663          list_addtail(&job->list_link, &cmd_buffer->jobs);
664       }
665    }
666 }
667 
668 void
669 v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
670 {
671    struct v3dv_job *job = cmd_buffer->state.job;
672    if (!job)
673       return;
674 
675    if (cmd_buffer->state.oom) {
676       v3dv_job_destroy(job);
677       cmd_buffer->state.job = NULL;
678       return;
679    }
680 
681    /* If we have created a job for a command buffer then we should have
682     * recorded something into it: if the job was started in a render pass, it
683     * should at least have the start frame commands, otherwise, it should have
684     * a transfer command. The only exception is secondary command buffers
685     * inside a render pass.
686     */
687    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
688           v3dv_cl_offset(&job->bcl) > 0);
689 
690    /* When we merge multiple subpasses into the same job we must only emit one
691     * RCL, so we do that here, when we decide that we need to finish the job.
692     * Any rendering that happens outside a render pass is never merged, so
693     * the RCL should have been emitted by the time we got here.
694     */
695    assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
696 
697    /* If we are finishing a job inside a render pass we have two scenarios:
698     *
699     * 1. It is a regular CL, in which case we will submit the job to the GPU,
700     *    so we may need to generate an RCL and add a binning flush.
701     *
702     * 2. It is a partial CL recorded in a secondary command buffer, in which
703     *    case we are not submitting it directly to the GPU but rather branch to
704     *    it from a primary command buffer. In this case we just want to end
705     *    the BCL with a RETURN_FROM_SUB_LIST and the RCL and binning flush
706     *    will be the primary job that branches to this CL.
707     */
708    if (cmd_buffer->state.pass) {
709       if (job->type == V3DV_JOB_TYPE_GPU_CL) {
710          cmd_buffer_end_render_pass_frame(cmd_buffer);
711       } else {
712          assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
713          v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
714       }
715    }
716 
717    list_addtail(&job->list_link, &cmd_buffer->jobs);
718    cmd_buffer->state.job = NULL;
719 
720    /* If we have recorded any state with this last GPU job that requires us to
721     * emit CPU jobs after the job is completed, add them now. The only
722     * exception is secondary command buffers inside a render pass, because in
723     * that case we want to defer this until we finish recording the primary
724     * job into which we execute the secondary.
725     */
726    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
727        !cmd_buffer->state.pass) {
728       cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
729    }
730 }
731 
732 static bool
733 job_type_is_gpu(struct v3dv_job *job)
734 {
735    switch (job->type) {
736    case V3DV_JOB_TYPE_GPU_CL:
737    case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
738    case V3DV_JOB_TYPE_GPU_TFU:
739    case V3DV_JOB_TYPE_GPU_CSD:
740       return true;
741    default:
742       return false;
743    }
744 }
745 
746 static void
747 cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
748                                    struct v3dv_job *job)
749 {
750    assert(cmd_buffer && job);
751 
752    if (!cmd_buffer->state.has_barrier)
753       return;
754 
755    /* Serialization only affects GPU jobs, CPU jobs are always automatically
756     * serialized.
757     */
758    if (!job_type_is_gpu(job))
759       return;
760 
761    job->serialize = true;
762    if (cmd_buffer->state.has_bcl_barrier &&
763        (job->type == V3DV_JOB_TYPE_GPU_CL ||
764         job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) {
765       job->needs_bcl_sync = true;
766    }
767 
768    cmd_buffer->state.has_barrier = false;
769    cmd_buffer->state.has_bcl_barrier = false;
770 }
771 
772 void
773 v3dv_job_init(struct v3dv_job *job,
774               enum v3dv_job_type type,
775               struct v3dv_device *device,
776               struct v3dv_cmd_buffer *cmd_buffer,
777               int32_t subpass_idx)
778 {
779    assert(job);
780 
781    /* Make sure we haven't made this new job current before calling here */
782    assert(!cmd_buffer || cmd_buffer->state.job != job);
783 
784    job->type = type;
785 
786    job->device = device;
787    job->cmd_buffer = cmd_buffer;
788 
789    list_inithead(&job->list_link);
790 
791    if (type == V3DV_JOB_TYPE_GPU_CL ||
792        type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
793        type == V3DV_JOB_TYPE_GPU_CSD) {
794       job->bos =
795          _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
796       job->bo_count = 0;
797 
798       v3dv_cl_init(job, &job->indirect);
799 
800       if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH))
801          job->always_flush = true;
802    }
803 
804    if (type == V3DV_JOB_TYPE_GPU_CL ||
805        type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
806       v3dv_cl_init(job, &job->bcl);
807       v3dv_cl_init(job, &job->rcl);
808    }
809 
810    if (cmd_buffer) {
811       /* Flag all state as dirty. Generally, we need to re-emit state for each
812        * new job.
813        *
814        * FIXME: there may be some exceptions, in which case we could skip some
815        * bits.
816        */
817       cmd_buffer->state.dirty = ~0;
818       cmd_buffer->state.dirty_descriptor_stages = ~0;
819 
820       /* Honor inheritance of occlusion queries in secondaries if requested */
821       if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
822           cmd_buffer->state.inheritance.occlusion_query_enable) {
823          cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
824       }
825 
826       /* Keep track of the first subpass that we are recording in this new job.
827        * We will use this when we emit the RCL to decide how to emit our loads
828        * and stores.
829        */
830       if (cmd_buffer->state.pass)
831          job->first_subpass = subpass_idx;
832 
833       cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
834    }
835 }
836 
837 struct v3dv_job *
838 v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
839                           int32_t subpass_idx,
840                           enum v3dv_job_type type)
841 {
842    /* Don't create a new job if we can merge the current subpass into
843     * the current job.
844     */
845    if (cmd_buffer->state.pass &&
846        subpass_idx != -1 &&
847        cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
848       cmd_buffer->state.job->is_subpass_finish = false;
849       return cmd_buffer->state.job;
850    }
851 
852    /* Ensure we are not starting a new job without finishing a previous one */
853    if (cmd_buffer->state.job != NULL)
854       v3dv_cmd_buffer_finish_job(cmd_buffer);
855 
856    assert(cmd_buffer->state.job == NULL);
857    struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
858                                     sizeof(struct v3dv_job), 8,
859                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
860 
861    if (!job) {
862       fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
863       v3dv_flag_oom(cmd_buffer, NULL);
864       return NULL;
865    }
866 
867    v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
868    cmd_buffer->state.job = job;
869 
870    return job;
871 }
872 
873 static VkResult
874 cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
875                  VkCommandBufferResetFlags flags)
876 {
877    vk_command_buffer_reset(&cmd_buffer->vk);
878    if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
879       struct v3dv_device *device = cmd_buffer->device;
880       struct v3dv_cmd_pool *pool = cmd_buffer->pool;
881       VkCommandBufferLevel level = cmd_buffer->level;
882 
883       /* cmd_buffer_init below will re-add the command buffer to the pool
884        * so remove it here to avoid adding it again.
885        */
886       list_del(&cmd_buffer->pool_link);
887 
888       /* FIXME: For now we always free all resources as if
889        * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
890        */
891       if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
892          cmd_buffer_free_resources(cmd_buffer);
893 
894       cmd_buffer_init(cmd_buffer, device, pool, level);
895    }
896 
897    assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
898    return VK_SUCCESS;
899 }
900 
901 VKAPI_ATTR VkResult VKAPI_CALL
902 v3dv_AllocateCommandBuffers(VkDevice _device,
903                             const VkCommandBufferAllocateInfo *pAllocateInfo,
904                             VkCommandBuffer *pCommandBuffers)
905 {
906    V3DV_FROM_HANDLE(v3dv_device, device, _device);
907    V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool);
908 
909    VkResult result = VK_SUCCESS;
910    uint32_t i;
911 
912    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
913       result = cmd_buffer_create(device, pool, pAllocateInfo->level,
914                                  &pCommandBuffers[i]);
915       if (result != VK_SUCCESS)
916          break;
917    }
918 
919    if (result != VK_SUCCESS) {
920       v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
921                               i, pCommandBuffers);
922       for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
923          pCommandBuffers[i] = VK_NULL_HANDLE;
924    }
925 
926    return result;
927 }
928 
929 VKAPI_ATTR void VKAPI_CALL
930 v3dv_FreeCommandBuffers(VkDevice device,
931                         VkCommandPool commandPool,
932                         uint32_t commandBufferCount,
933                         const VkCommandBuffer *pCommandBuffers)
934 {
935    for (uint32_t i = 0; i < commandBufferCount; i++) {
936       V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
937 
938       if (!cmd_buffer)
939          continue;
940 
941       cmd_buffer_destroy(cmd_buffer);
942    }
943 }
944 
945 VKAPI_ATTR void VKAPI_CALL
946 v3dv_DestroyCommandPool(VkDevice _device,
947                         VkCommandPool commandPool,
948                         const VkAllocationCallbacks *pAllocator)
949 {
950    V3DV_FROM_HANDLE(v3dv_device, device, _device);
951    V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);
952 
953    if (!pool)
954       return;
955 
956    list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
957                             &pool->cmd_buffers, pool_link) {
958       cmd_buffer_destroy(cmd_buffer);
959    }
960 
961    vk_object_free(&device->vk, pAllocator, pool);
962 }
963 
964 VKAPI_ATTR void VKAPI_CALL
965 v3dv_TrimCommandPool(VkDevice device,
966                      VkCommandPool commandPool,
967                      VkCommandPoolTrimFlags flags)
968 {
969    /* We don't need to do anything here, our command pools never hold on to
970     * any resources from command buffers that are freed or reset.
971     */
972 }
973 
974 
975 static void
976 cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
977 {
978    assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
979    const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
980    const struct v3dv_subpass *subpass =
981       &pass->subpasses[cmd_buffer->state.subpass_idx];
982 
983    if (!subpass->resolve_attachments)
984       return;
985 
986    struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;
987 
988    /* At this point we have already ended the current subpass and now we are
989     * about to emit vkCmdResolveImage calls to get the resolves we can't
990     * handle in the subpass RCL.
991     *
992     * vkCmdResolveImage is not supposed to be called inside a render pass so
993     * before we call that we need to make sure our command buffer state reflects
994     * that we are no longer in a subpass by finishing the current job and
995     * resetting the framebuffer and render pass state temporarily and then
996     * restoring it after we are done with the resolves.
997     */
998    if (cmd_buffer->state.job)
999       v3dv_cmd_buffer_finish_job(cmd_buffer);
1000    struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
1001    struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
1002    uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
1003    cmd_buffer->state.framebuffer = NULL;
1004    cmd_buffer->state.pass = NULL;
1005    cmd_buffer->state.subpass_idx = -1;
1006 
1007    VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
1008    for (uint32_t i = 0; i < subpass->color_count; i++) {
1009       const uint32_t src_attachment_idx =
1010          subpass->color_attachments[i].attachment;
1011       if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
1012          continue;
1013 
1014       if (pass->attachments[src_attachment_idx].use_tlb_resolve)
1015          continue;
1016 
1017       const uint32_t dst_attachment_idx =
1018          subpass->resolve_attachments[i].attachment;
1019       if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
1020          continue;
1021 
1022       struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
1023       struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];
1024 
1025       VkImageResolve2KHR region = {
1026          .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR,
1027          .srcSubresource = {
1028             VK_IMAGE_ASPECT_COLOR_BIT,
1029             src_iview->vk.base_mip_level,
1030             src_iview->vk.base_array_layer,
1031             src_iview->vk.layer_count,
1032          },
1033          .srcOffset = { 0, 0, 0 },
1034          .dstSubresource =  {
1035             VK_IMAGE_ASPECT_COLOR_BIT,
1036             dst_iview->vk.base_mip_level,
1037             dst_iview->vk.base_array_layer,
1038             dst_iview->vk.layer_count,
1039          },
1040          .dstOffset = { 0, 0, 0 },
1041          .extent = src_iview->vk.image->extent,
1042       };
1043 
1044       struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
1045       struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
1046       VkResolveImageInfo2KHR resolve_info = {
1047          .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR,
1048          .srcImage = v3dv_image_to_handle(src_image),
1049          .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1050          .dstImage = v3dv_image_to_handle(dst_image),
1051          .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1052          .regionCount = 1,
1053          .pRegions = &region,
1054       };
1055       v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info);
1056    }
1057 
1058    cmd_buffer->state.framebuffer = restore_fb;
1059    cmd_buffer->state.pass = restore_pass;
1060    cmd_buffer->state.subpass_idx = restore_subpass_idx;
1061 }
1062 
1063 static VkResult
1064 cmd_buffer_begin_render_pass_secondary(
1065    struct v3dv_cmd_buffer *cmd_buffer,
1066    const VkCommandBufferInheritanceInfo *inheritance_info)
1067 {
1068    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1069    assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
1070    assert(inheritance_info);
1071 
1072    cmd_buffer->state.pass =
1073       v3dv_render_pass_from_handle(inheritance_info->renderPass);
1074    assert(cmd_buffer->state.pass);
1075 
1076    cmd_buffer->state.framebuffer =
1077       v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
1078 
1079    assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
1080    cmd_buffer->state.subpass_idx = inheritance_info->subpass;
1081 
1082    cmd_buffer->state.inheritance.occlusion_query_enable =
1083       inheritance_info->occlusionQueryEnable;
1084 
1085    /* Secondaries that execute inside a render pass won't start subpasses
1086     * so we want to create a job for them here.
1087     */
1088    struct v3dv_job *job =
1089       v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
1090                                 V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1091    if (!job) {
1092       v3dv_flag_oom(cmd_buffer, NULL);
1093       return VK_ERROR_OUT_OF_HOST_MEMORY;
1094    }
1095 
1096    /* Secondary command buffers don't know about the render area, but our
1097     * scissor setup accounts for it, so let's make sure we make it large
1098     * enough that it doesn't actually constrain any rendering. This should
1099     * be fine, since the Vulkan spec states:
1100     *
1101     *    "The application must ensure (using scissor if necessary) that all
1102     *     rendering is contained within the render area."
1103     *
1104     * FIXME: setup constants for the max framebuffer dimensions and use them
1105     * here and when filling in VkPhysicalDeviceLimits.
1106     */
1107    const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1108    cmd_buffer->state.render_area.offset.x = 0;
1109    cmd_buffer->state.render_area.offset.y = 0;
1110    cmd_buffer->state.render_area.extent.width =
1111       framebuffer ? framebuffer->width : 4096;
1112    cmd_buffer->state.render_area.extent.height =
1113       framebuffer ? framebuffer->height : 4096;
1114 
1115    return VK_SUCCESS;
1116 }
1117 
1118 VKAPI_ATTR VkResult VKAPI_CALL
1119 v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1120                         const VkCommandBufferBeginInfo *pBeginInfo)
1121 {
1122    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1123 
1124    /* If this is the first vkBeginCommandBuffer, we must initialize the
1125     * command buffer's state. Otherwise, we must reset its state. In both
1126     * cases we reset it.
1127     */
1128    VkResult result = cmd_buffer_reset(cmd_buffer, 0);
1129    if (result != VK_SUCCESS)
1130       return result;
1131 
1132    assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
1133 
1134    cmd_buffer->usage_flags = pBeginInfo->flags;
1135 
1136    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1137       if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1138          result =
1139             cmd_buffer_begin_render_pass_secondary(cmd_buffer,
1140                                                    pBeginInfo->pInheritanceInfo);
1141          if (result != VK_SUCCESS)
1142             return result;
1143       }
1144    }
1145 
1146    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
1147 
1148    return VK_SUCCESS;
1149 }
1150 
1151 VKAPI_ATTR VkResult VKAPI_CALL
1152 v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1153                         VkCommandBufferResetFlags flags)
1154 {
1155    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1156    return cmd_buffer_reset(cmd_buffer, flags);
1157 }
1158 
1159 VKAPI_ATTR VkResult VKAPI_CALL
1160 v3dv_ResetCommandPool(VkDevice device,
1161                       VkCommandPool commandPool,
1162                       VkCommandPoolResetFlags flags)
1163 {
1164    V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);
1165 
1166    VkCommandBufferResetFlags reset_flags = 0;
1167    if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT)
1168       reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT;
1169    list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
1170                             &pool->cmd_buffers, pool_link) {
1171       cmd_buffer_reset(cmd_buffer, reset_flags);
1172    }
1173 
1174    return VK_SUCCESS;
1175 }
1176 
1177 static void
1178 cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
1179 {
1180    /* Render areas and scissor/viewport are only relevant inside render passes,
1181     * otherwise we are dealing with transfer operations where these elements
1182     * don't apply.
1183     */
1184    assert(cmd_buffer->state.pass);
1185    const VkRect2D *rect = &cmd_buffer->state.render_area;
1186 
1187    /* We should only call this at the beginning of a subpass so we should
1188     * always have framebuffer information available.
1189     */
1190    assert(cmd_buffer->state.framebuffer);
1191    cmd_buffer->state.tile_aligned_render_area =
1192       v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
1193                                         cmd_buffer->state.framebuffer,
1194                                         cmd_buffer->state.pass,
1195                                         cmd_buffer->state.subpass_idx);
1196 
1197    if (!cmd_buffer->state.tile_aligned_render_area) {
1198       perf_debug("Render area for subpass %d of render pass %p doesn't "
1199                  "match render pass granularity.\n",
1200                  cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
1201    }
1202 }
1203 
1204 static void
1205 cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
1206                                             uint32_t attachment_idx,
1207                                             const VkClearColorValue *color)
1208 {
1209    assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
1210 
1211    const struct v3dv_render_pass_attachment *attachment =
1212       &cmd_buffer->state.pass->attachments[attachment_idx];
1213 
1214    uint32_t internal_type, internal_bpp;
1215    const struct v3dv_format *format =
1216       v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);
1217 
1218    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
1219       (format->rt_type, &internal_type, &internal_bpp);
1220 
1221    uint32_t internal_size = 4 << internal_bpp;
1222 
1223    struct v3dv_cmd_buffer_attachment_state *attachment_state =
1224       &cmd_buffer->state.attachments[attachment_idx];
1225 
1226    v3dv_X(cmd_buffer->device, get_hw_clear_color)
1227       (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);
1228 
1229    attachment_state->vk_clear_value.color = *color;
1230 }
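
/* Illustrative note (not driver code): internal_bpp is an enum-style value
 * (assumed 0 for 32bpp, 1 for 64bpp, 2 for 128bpp), so internal_size above
 * is the per-pixel byte size of the tile buffer format:
 * 4 << 0 = 4, 4 << 1 = 8 and 4 << 2 = 16 bytes.
 */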
1231 
1232 static void
1233 cmd_buffer_state_set_attachment_clear_depth_stencil(
1234    struct v3dv_cmd_buffer *cmd_buffer,
1235    uint32_t attachment_idx,
1236    bool clear_depth, bool clear_stencil,
1237    const VkClearDepthStencilValue *ds)
1238 {
1239    struct v3dv_cmd_buffer_attachment_state *attachment_state =
1240       &cmd_buffer->state.attachments[attachment_idx];
1241 
1242    if (clear_depth)
1243       attachment_state->clear_value.z = ds->depth;
1244 
1245    if (clear_stencil)
1246       attachment_state->clear_value.s = ds->stencil;
1247 
1248    attachment_state->vk_clear_value.depthStencil = *ds;
1249 }
1250 
1251 static void
1252 cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
1253                                   uint32_t count, const VkClearValue *values)
1254 {
1255    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1256    const struct v3dv_render_pass *pass = state->pass;
1257 
1258    /* There could be fewer clear values than attachments in the render pass, in
1259     * which case we only want to process as many as we have, or there could be
1260     * more, in which case we want to ignore those for which we don't have a
1261     * corresponding attachment.
1262     */
1263    count = MIN2(count, pass->attachment_count);
1264    for (uint32_t i = 0; i < count; i++) {
1265       const struct v3dv_render_pass_attachment *attachment =
1266          &pass->attachments[i];
1267 
1268       if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1269          continue;
1270 
1271       VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
1272       if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
1273          cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
1274                                                      &values[i].color);
1275       } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
1276                             VK_IMAGE_ASPECT_STENCIL_BIT)) {
1277          cmd_buffer_state_set_attachment_clear_depth_stencil(
1278             cmd_buffer, i,
1279             aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1280             aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1281             &values[i].depthStencil);
1282       }
1283    }
1284 }
1285 
1286 static void
1287 cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
1288                                              const VkRenderPassBeginInfo *pRenderPassBegin)
1289 {
1290    cmd_buffer_state_set_clear_values(cmd_buffer,
1291                                      pRenderPassBegin->clearValueCount,
1292                                      pRenderPassBegin->pClearValues);
1293 }
1294 
1295 static void
1296 cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
1297 {
1298    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1299    const struct v3dv_render_pass *pass = state->pass;
1300 
1301    if (state->attachment_alloc_count < pass->attachment_count) {
1302       if (state->attachments) {
1303          assert(state->attachment_alloc_count > 0);
1304          vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
1305       }
1306 
1307       uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
1308                       pass->attachment_count;
1309       state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
1310                                      VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1311       if (!state->attachments) {
1312          v3dv_flag_oom(cmd_buffer, NULL);
1313          return;
1314       }
1315       state->attachment_alloc_count = pass->attachment_count;
1316    }
1317 
1318    assert(state->attachment_alloc_count >= pass->attachment_count);
1319 }
1320 
1321 VKAPI_ATTR void VKAPI_CALL
1322 v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
1323                         const VkRenderPassBeginInfo *pRenderPassBegin,
1324                         VkSubpassContents contents)
1325 {
1326    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1327    V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
1328    V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1329 
1330    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1331    state->pass = pass;
1332    state->framebuffer = framebuffer;
1333 
1334    cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
1335    v3dv_return_if_oom(cmd_buffer, NULL);
1336 
1337    cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);
1338 
1339    state->render_area = pRenderPassBegin->renderArea;
1340 
1341    /* If our render area is smaller than the current clip window we will have
1342     * to emit a new clip window to constrain it to the render area.
1343     */
1344    uint32_t min_render_x = state->render_area.offset.x;
1345    uint32_t min_render_y = state->render_area.offset.y;
1346    uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
1347    uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
1348    uint32_t min_clip_x = state->clip_window.offset.x;
1349    uint32_t min_clip_y = state->clip_window.offset.y;
1350    uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
1351    uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
1352    if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
1353        max_render_x < max_clip_x || max_render_y < max_clip_y) {
1354       state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
1355    }
1356 
1357    /* Setup for first subpass */
1358    v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
1359 }
1360 
1361 VKAPI_ATTR void VKAPI_CALL
1362 v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
1363 {
1364    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1365 
1366    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1367    assert(state->subpass_idx < state->pass->subpass_count - 1);
1368 
1369    /* Finish the previous subpass */
1370    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1371    cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1372 
1373    /* Start the next subpass */
1374    v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
1375 }
1376 
1377 static void
1378 cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
1379 {
1380    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1381 
1382    assert(cmd_buffer->state.pass);
1383    assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
1384    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1385    const struct v3dv_render_pass *pass = state->pass;
1386    const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1387 
1388    /* We only need to emit subpass clears as draw calls when the render
1389     * area is not aligned to tile boundaries or for GFXH-1461.
1390     */
1391    if (cmd_buffer->state.tile_aligned_render_area &&
1392        !subpass->do_depth_clear_with_draw &&
1393        !subpass->do_stencil_clear_with_draw) {
1394       return;
1395    }
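
   /* Illustrative example (assuming a 64x64 tile size): a render area of
    * (0, 0) 100x100 doesn't cover whole tiles along its right/bottom edges,
    * so clearing via the TLB would write pixels outside the render area and
    * we have to emit the clears as draws below instead.
    */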
1396 
1397    uint32_t att_count = 0;
1398    VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
1399 
1400    /* We only need to emit subpass clears as draw calls for color attachments
1401     * if the render area is not aligned to tile boundaries.
1402     */
1403    if (!cmd_buffer->state.tile_aligned_render_area) {
1404       for (uint32_t i = 0; i < subpass->color_count; i++) {
1405          const uint32_t att_idx = subpass->color_attachments[i].attachment;
1406          if (att_idx == VK_ATTACHMENT_UNUSED)
1407             continue;
1408 
1409          struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
1410          if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1411             continue;
1412 
1413          if (state->subpass_idx != att->first_subpass)
1414             continue;
1415 
1416          atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
1417          atts[att_count].colorAttachment = i;
1418          atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
1419          att_count++;
1420       }
1421    }
1422 
1423    /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
1424    const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
1425    if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
1426       struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
1427       if (state->subpass_idx == att->first_subpass) {
1428          VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
1429          if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1430              (cmd_buffer->state.tile_aligned_render_area &&
1431               !subpass->do_depth_clear_with_draw)) {
1432             aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
1433          }
1434          if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1435              (cmd_buffer->state.tile_aligned_render_area &&
1436               !subpass->do_stencil_clear_with_draw)) {
1437             aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
1438          }
1439          if (aspects) {
1440             atts[att_count].aspectMask = aspects;
1441             atts[att_count].colorAttachment = 0; /* Ignored */
1442             atts[att_count].clearValue =
1443                state->attachments[ds_att_idx].vk_clear_value;
1444             att_count++;
1445          }
1446       }
1447    }
1448 
1449    if (att_count == 0)
1450       return;
1451 
1452    if (!cmd_buffer->state.tile_aligned_render_area) {
1453       perf_debug("Render area doesn't match render pass granularity, falling "
1454                  "back to vkCmdClearAttachments for "
1455                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1456    } else if (subpass->do_depth_clear_with_draw ||
1457               subpass->do_stencil_clear_with_draw) {
1458       perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
1459                  "falling back to vkCmdClearAttachments for "
1460                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1461    }
1462 
1463    /* From the Vulkan 1.0 spec:
1464     *
1465     *    "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
1466     *     render area will be cleared to a uniform value, which is specified
1467     *     when a render pass instance is begun."
1468     *
1469     * So the clear is only constrained by the render area and not by pipeline
1470     * state such as scissor or viewport, these are the semantics of
1471     * vkCmdClearAttachments as well.
1472     */
1473    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1474    VkClearRect rect = {
1475       .rect = state->render_area,
1476       .baseArrayLayer = 0,
1477       .layerCount = 1,
1478    };
1479    v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
1480 }
1481 
1482 static struct v3dv_job *
1483 cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
1484                               uint32_t subpass_idx,
1485                               enum v3dv_job_type type)
1486 {
1487    assert(type == V3DV_JOB_TYPE_GPU_CL ||
1488           type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1489 
1490    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1491    assert(subpass_idx < state->pass->subpass_count);
1492 
1493    /* Starting a new job can trigger a finish of the current one, so don't
1494     * change the command buffer state for the new job until we are done creating
1495     * the new job.
1496     */
1497    struct v3dv_job *job =
1498       v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
1499    if (!job)
1500       return NULL;
1501 
1502    state->subpass_idx = subpass_idx;
1503 
1504    /* If we are starting a new job we need to setup binning. We only do this
1505     * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
1506     * jobs are not submitted to the GPU directly, and are instead meant to be
1507     * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
1508     */
1509    if (type == V3DV_JOB_TYPE_GPU_CL &&
1510        job->first_subpass == state->subpass_idx) {
1511       const struct v3dv_subpass *subpass =
1512          &state->pass->subpasses[state->subpass_idx];
1513 
1514       const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1515 
1516       uint8_t internal_bpp;
1517       bool msaa;
1518       v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
1519          (framebuffer, subpass, &internal_bpp, &msaa);
1520 
1521       /* From the Vulkan spec:
1522        *
1523        *    "If the render pass uses multiview, then layers must be one and
1524        *     each attachment requires a number of layers that is greater than
1525        *     the maximum bit index set in the view mask in the subpasses in
1526        *     which it is used."
1527        *
1528        * So when multiview is enabled, we take the number of layers from the
1529        * last bit set in the view mask.
1530        */
1531       uint32_t layers = framebuffer->layers;
1532       if (subpass->view_mask != 0) {
1533          assert(framebuffer->layers == 1);
1534          layers = util_last_bit(subpass->view_mask);
1535       }
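
      /* Example (hypothetical): a subpass with view_mask = 0b0101 renders
       * views 0 and 2, so util_last_bit(0b0101) = 3 and we size the frame
       * for 3 layers even though the framebuffer was created with layers = 1.
       */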
1536 
1537       v3dv_job_start_frame(job,
1538                            framebuffer->width,
1539                            framebuffer->height,
1540                            layers,
1541                            true,
1542                            subpass->color_count,
1543                            internal_bpp,
1544                            msaa);
1545    }
1546 
1547    return job;
1548 }
1549 
1550 struct v3dv_job *
1551 v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
1552                               uint32_t subpass_idx)
1553 {
1554    assert(cmd_buffer->state.pass);
1555    assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1556 
1557    struct v3dv_job *job =
1558       cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1559                                     V3DV_JOB_TYPE_GPU_CL);
1560    if (!job)
1561       return NULL;
1562 
1563    /* Check if our render area is aligned to tile boundaries. We have to do
1564     * this in each subpass because the subset of attachments used can change
1565     * and with that the tile size selected by the hardware can change too.
1566     */
1567    cmd_buffer_update_tile_alignment(cmd_buffer);
1568 
1569    /* If we can't use TLB clears then we need to emit draw clears for any
1570     * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
1571     * Depth/Stencil clears if we hit GFXH-1461.
1572     *
1573     * Secondary command buffers don't start subpasses (and may not even have
1574     * framebuffer state), so we only care about this in primaries. The only
1575     * exception could be a secondary running inside a subpass that needs to
1576     * record a meta operation (with its own render pass) that relies on
1577     * attachment load clears, but we don't have any instances of that right
1578     * now.
1579     */
1580    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1581       cmd_buffer_emit_subpass_clears(cmd_buffer);
1582 
1583    return job;
1584 }
1585 
1586 struct v3dv_job *
1587 v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
1588                                uint32_t subpass_idx)
1589 {
1590    assert(cmd_buffer->state.pass);
1591    assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1592 
1593    struct v3dv_job *job;
1594    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1595       job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1596                                           V3DV_JOB_TYPE_GPU_CL);
1597    } else {
1598       assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1599       job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1600                                           V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1601    }
1602 
1603    if (!job)
1604       return NULL;
1605 
1606    job->is_subpass_continue = true;
1607 
1608    return job;
1609 }
1610 
1611 void
1612 v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
1613 {
1614    /* We can end up here without a job if the last command recorded into the
1615     * subpass already finished the job (for example a pipeline barrier). In
1616     * that case we won't set the is_subpass_finish flag, but that is not
1617     * required for proper behavior.
1618     */
1619    struct v3dv_job *job = cmd_buffer->state.job;
1620    if (job)
1621       job->is_subpass_finish = true;
1622 }
1623 
1624 VKAPI_ATTR void VKAPI_CALL
1625 v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
1626 {
1627    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1628 
1629    /* Finalize last subpass */
1630    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1631    assert(state->subpass_idx == state->pass->subpass_count - 1);
1632    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1633    v3dv_cmd_buffer_finish_job(cmd_buffer);
1634 
1635    cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1636 
1637    /* We are no longer inside a render pass */
1638    state->framebuffer = NULL;
1639    state->pass = NULL;
1640    state->subpass_idx = -1;
1641 }
1642 
1643 VKAPI_ATTR VkResult VKAPI_CALL
1644 v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
1645 {
1646    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1647 
1648    if (cmd_buffer->state.oom)
1649       return VK_ERROR_OUT_OF_HOST_MEMORY;
1650 
1651    /* Primaries should have ended any recording jobs by the time they hit
1652     * vkCmdEndRenderPass (if we are inside a render pass). Commands outside
1653     * a render pass instance (for both primaries and secondaries) spawn
1654     * complete jobs too. So the only case where we can get here without
1655     * finishing a recording job is when we are recording a secondary
1656     * inside a render pass.
1657     */
1658    if (cmd_buffer->state.job) {
1659       assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
1660              cmd_buffer->state.pass);
1661       v3dv_cmd_buffer_finish_job(cmd_buffer);
1662    }
1663 
1664    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
1665 
1666    return VK_SUCCESS;
1667 }
1668 
1669 static void
1670 clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
1671               struct list_head *dst,
1672               struct list_head *src)
1673 {
1674    assert(cmd_buffer);
1675 
1676    list_inithead(dst);
1677    list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
1678       struct v3dv_bo *clone_bo =
1679          vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
1680                   VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1681       if (!clone_bo) {
1682          v3dv_flag_oom(cmd_buffer, NULL);
1683          return;
1684       }
1685 
1686       *clone_bo = *bo;
1687       list_addtail(&clone_bo->list_link, dst);
1688    }
1689 }
1690 
1691 /* Clones a job for inclusion in the given command buffer. Note that this
1692  * doesn't make a deep copy, so the cloned job doesn't own any resources.
1693  * Useful when we need to have a job in more than one list, which happens
1694  * for jobs recorded in secondary command buffers when we want to execute
1695  * them in primaries.
1696  */
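/* For instance (illustrative), executing the same secondary command buffer
 * with vkCmdExecuteCommands in two different primaries produces two clones,
 * one per primary, each with its own list_link and its own shallow copies of
 * the BO lists.
 */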
1697 struct v3dv_job *
1698 v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
1699                              struct v3dv_cmd_buffer *cmd_buffer)
1700 {
1701    struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
1702                                          sizeof(struct v3dv_job), 8,
1703                                          VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1704    if (!clone_job) {
1705       v3dv_flag_oom(cmd_buffer, NULL);
1706       return NULL;
1707    }
1708 
1709    /* Cloned jobs don't duplicate resources! */
1710    *clone_job = *job;
1711    clone_job->is_clone = true;
1712    clone_job->cmd_buffer = cmd_buffer;
1713    list_addtail(&clone_job->list_link, &cmd_buffer->jobs);
1714 
1715    /* We need to regen the BO lists so that they point to the BO list in the
1716     * cloned job. Otherwise functions like list_length() will loop forever.
1717     */
1718    if (job->type == V3DV_JOB_TYPE_GPU_CL) {
1719       clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
1720       clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
1721       clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
1722                     &job->indirect.bo_list);
1723    }
1724 
1725    return clone_job;
1726 }
1727 
1728 static void
1729 cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
1730                                 uint32_t cmd_buffer_count,
1731                                 const VkCommandBuffer *cmd_buffers)
1732 {
1733    bool pending_barrier = false;
1734    bool pending_bcl_barrier = false;
1735    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
1736       V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
1737 
1738       assert(!(secondary->usage_flags &
1739                VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
1740 
1741       /* Secondary command buffers that execute outside a render pass create
1742        * complete jobs with an RCL and tile setup, so we simply want to merge
1743        * their job list into the primary's. However, because they may be
1744        * executed in multiple primaries at the same time and we only have a
1745        * single list_link in each job, we can't just add them to the primary's
1746        * job list and we instead have to clone them first.
1747        *
1748        * Alternatively, we could create an "execute secondary" CPU job that,
1749        * when executed in a queue, would submit all the jobs in the referenced
1750        * secondary command buffer. However, this would raise some challenges
1751        * to make it work with the implementation of wait threads in the queue
1752        * which we use for event waits, for example.
1753        */
1754       list_for_each_entry(struct v3dv_job, secondary_job,
1755                           &secondary->jobs, list_link) {
1756          /* These can only happen inside a render pass */
1757          assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1758          struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
1759          if (!job)
1760             return;
1761 
1762          if (pending_barrier) {
1763             job->serialize = true;
1764             if (pending_bcl_barrier)
1765                job->needs_bcl_sync = true;
1766             pending_barrier = false;
1767             pending_bcl_barrier = false;
1768          }
1769       }
1770 
1771       /* If this secondary had any pending barrier state we will need that
1772        * barrier state consumed with whatever comes after it (first job in
1773        * the next secondary or the primary, if this was the last secondary).
1774        */
1775       assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
1776       pending_barrier = secondary->state.has_barrier;
1777       pending_bcl_barrier = secondary->state.has_bcl_barrier;
1778    }
1779 
1780    if (pending_barrier) {
1781       primary->state.has_barrier = true;
1782       primary->state.has_bcl_barrier |= pending_bcl_barrier;
1783    }
1784 }
1785 
1786 VKAPI_ATTR void VKAPI_CALL
1787 v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
1788                         uint32_t commandBufferCount,
1789                         const VkCommandBuffer *pCommandBuffers)
1790 {
1791    V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
1792 
1793    if (primary->state.pass != NULL) {
1794       v3dv_X(primary->device, cmd_buffer_execute_inside_pass)
1795          (primary, commandBufferCount, pCommandBuffers);
1796    } else {
1797       cmd_buffer_execute_outside_pass(primary,
1798                                       commandBufferCount, pCommandBuffers);
1799    }
1800 }
1801 
1802 /* This goes through the list of possible dynamic states in the pipeline and,
1803  * for those that are not configured as dynamic, copies relevant state into
1804  * the command buffer.
1805  */
1806 static void
1807 cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
1808                                       const struct v3dv_dynamic_state *src)
1809 {
1810    struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic;
1811    uint32_t dynamic_mask = src->mask;
1812    uint32_t dirty = 0;
1813 
1814    if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) {
1815       dest->viewport.count = src->viewport.count;
1816       if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1817                  src->viewport.count * sizeof(VkViewport))) {
1818          typed_memcpy(dest->viewport.viewports,
1819                       src->viewport.viewports,
1820                       src->viewport.count);
1821          typed_memcpy(dest->viewport.scale, src->viewport.scale,
1822                       src->viewport.count);
1823          typed_memcpy(dest->viewport.translate, src->viewport.translate,
1824                       src->viewport.count);
1825          dirty |= V3DV_CMD_DIRTY_VIEWPORT;
1826       }
1827    }
1828 
1829    if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) {
1830       dest->scissor.count = src->scissor.count;
1831       if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1832                  src->scissor.count * sizeof(VkRect2D))) {
1833          typed_memcpy(dest->scissor.scissors,
1834                       src->scissor.scissors, src->scissor.count);
1835          dirty |= V3DV_CMD_DIRTY_SCISSOR;
1836       }
1837    }
1838 
1839    if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
1840       if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1841                  sizeof(src->stencil_compare_mask))) {
1842          dest->stencil_compare_mask = src->stencil_compare_mask;
1843          dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
1844       }
1845    }
1846 
1847    if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
1848       if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1849                  sizeof(src->stencil_write_mask))) {
1850          dest->stencil_write_mask = src->stencil_write_mask;
1851          dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
1852       }
1853    }
1854 
1855    if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
1856       if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1857                  sizeof(src->stencil_reference))) {
1858          dest->stencil_reference = src->stencil_reference;
1859          dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
1860       }
1861    }
1862 
1863    if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
1864       if (memcmp(dest->blend_constants, src->blend_constants,
1865                  sizeof(src->blend_constants))) {
1866          memcpy(dest->blend_constants, src->blend_constants,
1867                 sizeof(src->blend_constants));
1868          dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
1869       }
1870    }
1871 
1872    if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) {
1873       if (memcmp(&dest->depth_bias, &src->depth_bias,
1874                  sizeof(src->depth_bias))) {
1875          memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias));
1876          dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
1877       }
1878    }
1879 
1880    if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
1881       if (dest->line_width != src->line_width) {
1882          dest->line_width = src->line_width;
1883          dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
1884       }
1885    }
1886 
1887    if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) {
1888       if (dest->color_write_enable != src->color_write_enable) {
1889          dest->color_write_enable = src->color_write_enable;
1890          dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
1891       }
1892    }
1893 
1894    cmd_buffer->state.dynamic.mask = dynamic_mask;
1895    cmd_buffer->state.dirty |= dirty;
1896 }
1897 
1898 static void
1899 bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
1900                        struct v3dv_pipeline *pipeline)
1901 {
1902    assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
1903    if (cmd_buffer->state.gfx.pipeline == pipeline)
1904       return;
1905 
1906    cmd_buffer->state.gfx.pipeline = pipeline;
1907 
1908    cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);
1909 
1910    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
1911 }
1912 
1913 static void
1914 bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
1915                       struct v3dv_pipeline *pipeline)
1916 {
1917    assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1918 
1919    if (cmd_buffer->state.compute.pipeline == pipeline)
1920       return;
1921 
1922    cmd_buffer->state.compute.pipeline = pipeline;
1923    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
1924 }
1925 
1926 VKAPI_ATTR void VKAPI_CALL
1927 v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
1928                      VkPipelineBindPoint pipelineBindPoint,
1929                      VkPipeline _pipeline)
1930 {
1931    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1932    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
1933 
1934    switch (pipelineBindPoint) {
1935    case VK_PIPELINE_BIND_POINT_COMPUTE:
1936       bind_compute_pipeline(cmd_buffer, pipeline);
1937       break;
1938 
1939    case VK_PIPELINE_BIND_POINT_GRAPHICS:
1940       bind_graphics_pipeline(cmd_buffer, pipeline);
1941       break;
1942 
1943    default:
1944       assert(!"invalid bind point");
1945       break;
1946    }
1947 }
1948 
1949 /* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
1950 void
1951 v3dv_viewport_compute_xform(const VkViewport *viewport,
1952                             float scale[3],
1953                             float translate[3])
1954 {
1955    float x = viewport->x;
1956    float y = viewport->y;
1957    float half_width = 0.5f * viewport->width;
1958    float half_height = 0.5f * viewport->height;
1959    double n = viewport->minDepth;
1960    double f = viewport->maxDepth;
1961 
1962    scale[0] = half_width;
1963    translate[0] = half_width + x;
1964    scale[1] = half_height;
1965    translate[1] = half_height + y;
1966 
1967    scale[2] = (f - n);
1968    translate[2] = n;
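
   /* Worked example (hypothetical viewport): x=0, y=0, width=800, height=600,
    * minDepth=0.0, maxDepth=1.0 yields scale = (400, 300, 1) and
    * translate = (400, 300, 0), mapping NDC (-1, -1, 0)..(1, 1, 1) to
    * window coordinates (0, 0, 0)..(800, 600, 1).
    */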
1969 
1970    /* It seems that if the scale is small enough the hardware won't clip
1971     * correctly so we work around this by choosing the smallest scale that
1972     * seems to work.
1973     *
1974     * This case is exercised by CTS:
1975     * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
1976     */
1977    const float min_abs_scale = 0.000009f;
1978    if (fabs(scale[2]) < min_abs_scale)
1979       scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
1980 }
1981 
1982 VKAPI_ATTR void VKAPI_CALL
1983 v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
1984                     uint32_t firstViewport,
1985                     uint32_t viewportCount,
1986                     const VkViewport *pViewports)
1987 {
1988    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1989    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1990    const uint32_t total_count = firstViewport + viewportCount;
1991 
1992    assert(firstViewport < MAX_VIEWPORTS);
1993    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
1994 
1995    if (state->dynamic.viewport.count < total_count)
1996       state->dynamic.viewport.count = total_count;
1997 
1998    if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
1999                pViewports, viewportCount * sizeof(*pViewports))) {
2000       return;
2001    }
2002 
2003    memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
2004           viewportCount * sizeof(*pViewports));
2005 
2006    for (uint32_t i = firstViewport; i < total_count; i++) {
2007       v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
2008                                   state->dynamic.viewport.scale[i],
2009                                   state->dynamic.viewport.translate[i]);
2010    }
2011 
2012    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
2013 }
2014 
2015 VKAPI_ATTR void VKAPI_CALL
2016 v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
2017                    uint32_t firstScissor,
2018                    uint32_t scissorCount,
2019                    const VkRect2D *pScissors)
2020 {
2021    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2022    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2023 
2024    assert(firstScissor < MAX_SCISSORS);
2025    assert(firstScissor + scissorCount >= 1 &&
2026           firstScissor + scissorCount <= MAX_SCISSORS);
2027 
2028    if (state->dynamic.scissor.count < firstScissor + scissorCount)
2029       state->dynamic.scissor.count = firstScissor + scissorCount;
2030 
2031    if (!memcmp(state->dynamic.scissor.scissors + firstScissor,
2032                pScissors, scissorCount * sizeof(*pScissors))) {
2033       return;
2034    }
2035 
2036    memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
2037           scissorCount * sizeof(*pScissors));
2038 
2039    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR;
2040 }
2041 
2042 static void
2043 emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
2044 {
2045    if (cmd_buffer->state.dynamic.viewport.count == 0)
2046       return;
2047 
2048    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
2049 
2050    /* FIXME: right now we only support one viewport. viewports[0] would work
2051     * now, but would need to change if we allow multiple viewports.
2052     */
2053    float *vptranslate = dynamic->viewport.translate[0];
2054    float *vpscale = dynamic->viewport.scale[0];
2055 
2056    float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
2057    float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
2058    float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
2059    float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
2060 
2061    /* Quoting from v3dx_emit:
2062     * "Clip to the scissor if it's enabled, but still clip to the
2063     * drawable regardless since that controls where the binner
2064     * tries to put things.
2065     *
2066     * Additionally, always clip the rendering to the viewport,
2067     * since the hardware does guardband clipping, meaning
2068     * primitives would rasterize outside of the view volume."
2069     */
2070    uint32_t minx, miny, maxx, maxy;
2071 
2072    /* From the Vulkan spec:
2073     *
2074     * "The application must ensure (using scissor if necessary) that all
2075     *  rendering is contained within the render area. The render area must be
2076     *  contained within the framebuffer dimensions."
2077     *
2078     * So it is the application's responsibility to ensure this. Still, we can
2079     * help by automatically restricting the scissor rect to the render area.
2080     */
2081    minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x);
2082    miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y);
2083    maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x +
2084                         cmd_buffer->state.render_area.extent.width);
2085    maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
2086                         cmd_buffer->state.render_area.extent.height);
2087 
2093    /* Clip against user provided scissor if needed.
2094     *
2095     * FIXME: right now we only allow one scissor. Below would need to be
2096     * updated if we support more
2097     */
2098    if (dynamic->scissor.count > 0) {
2099       VkRect2D *scissor = &dynamic->scissor.scissors[0];
2100       minx = MAX2(minx, scissor->offset.x);
2101       miny = MAX2(miny, scissor->offset.y);
2102       maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);
2103       maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height);
2104    }
2105 
2106    /* If the scissor is outside the viewport area we end up with
2107     * min{x,y} > max{x,y}.
2108     */
2109    if (minx > maxx)
2110       maxx = minx;
2111    if (miny > maxy)
2112       maxy = miny;
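
   /* Worked example (hypothetical state, render area covering the full
    * framebuffer): an 800x600 viewport gives bounds (0, 0)..(800, 600); with
    * a user scissor of (100, 100) 200x200 the clip window below ends up with
    * offset (100, 100) and extent 200x200.
    */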
2113 
2114    cmd_buffer->state.clip_window.offset.x = minx;
2115    cmd_buffer->state.clip_window.offset.y = miny;
2116    cmd_buffer->state.clip_window.extent.width = maxx - minx;
2117    cmd_buffer->state.clip_window.extent.height = maxy - miny;
2118 
2119    v3dv_X(cmd_buffer->device, job_emit_clip_window)
2120       (cmd_buffer->state.job, &cmd_buffer->state.clip_window);
2121 
2122    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR;
2123 }
2124 
2125 static void
2126 update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
2127                          uint32_t dirty_uniform_state)
2128 {
2129    /* We need to update uniform streams if any piece of state that is passed
2130     * to the shader as a uniform may have changed.
2131     *
2132     * If only descriptor sets are dirty then we can safely ignore updates
2133     * for shader stages that don't access descriptors.
2134     */
2135 
2136    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2137    assert(pipeline);
2138 
2139    const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE;
2140    const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT;
2141    const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
2142    const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
2143    const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX;
2144 
2145    /* VK_SHADER_STAGE_FRAGMENT_BIT */
2146    const bool has_new_descriptors_fs =
2147       has_new_descriptors &&
2148       (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2149 
2150    const bool has_new_push_constants_fs =
2151       has_new_push_constants &&
2152       (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2153 
2154    const bool needs_fs_update = has_new_pipeline ||
2155                                 has_new_view_index ||
2156                                 has_new_push_constants_fs ||
2157                                 has_new_descriptors_fs;
2159 
2160    if (needs_fs_update) {
2161       struct v3dv_shader_variant *fs_variant =
2162          pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2163 
2164       cmd_buffer->state.uniforms.fs =
2165          v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
2166    }
2167 
2168    /* VK_SHADER_STAGE_GEOMETRY_BIT */
2169    if (pipeline->has_gs) {
2170       const bool has_new_descriptors_gs =
2171          has_new_descriptors &&
2172          (cmd_buffer->state.dirty_descriptor_stages &
2173           VK_SHADER_STAGE_GEOMETRY_BIT);
2174 
2175       const bool has_new_push_constants_gs =
2176          has_new_push_constants &&
2177          (cmd_buffer->state.dirty_push_constants_stages &
2178           VK_SHADER_STAGE_GEOMETRY_BIT);
2179 
2180       const bool needs_gs_update = has_new_viewport ||
2181                                    has_new_view_index ||
2182                                    has_new_pipeline ||
2183                                    has_new_push_constants_gs ||
2184                                    has_new_descriptors_gs;
2185 
2186       if (needs_gs_update) {
2187          struct v3dv_shader_variant *gs_variant =
2188             pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2189 
2190           struct v3dv_shader_variant *gs_bin_variant =
2191             pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2192 
2193          cmd_buffer->state.uniforms.gs =
2194             v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant);
2195 
2196          cmd_buffer->state.uniforms.gs_bin =
2197             v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant);
2198       }
2199    }
2200 
2201    /* VK_SHADER_STAGE_VERTEX_BIT */
2202    const bool has_new_descriptors_vs =
2203       has_new_descriptors &&
2204       (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT);
2205 
2206    const bool has_new_push_constants_vs =
2207       has_new_push_constants &&
2208       (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT);
2209 
2210    const bool needs_vs_update = has_new_viewport ||
2211                                 has_new_view_index ||
2212                                 has_new_pipeline ||
2213                                 has_new_push_constants_vs ||
2214                                 has_new_descriptors_vs;
2215 
2216    if (needs_vs_update) {
2217       struct v3dv_shader_variant *vs_variant =
2218          pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2219 
2220        struct v3dv_shader_variant *vs_bin_variant =
2221          pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2222 
2223       cmd_buffer->state.uniforms.vs =
2224          v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);
2225 
2226       cmd_buffer->state.uniforms.vs_bin =
2227          v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
2228    }
2229 
2230    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX;
2231 }
2232 
2233 /* This stores command buffer state that we might be about to stomp for
2234  * a meta operation.
2235  */
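/* Typical (illustrative) usage from a meta operation:
 *
 *    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
 *    ... bind meta pipeline/descriptors and record the meta draws ...
 *    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
 *
 * The dirty_dynamic_state flags and whether a subpass resume is required
 * depend on what state the particular meta operation clobbers.
 */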
2236 void
2237 v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
2238                                 bool push_descriptor_state)
2239 {
2240    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2241 
2242    if (state->subpass_idx != -1) {
2243       state->meta.subpass_idx = state->subpass_idx;
2244       state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
2245       state->meta.pass = v3dv_render_pass_to_handle(state->pass);
2246 
2247       const uint32_t attachment_state_item_size =
2248          sizeof(struct v3dv_cmd_buffer_attachment_state);
2249       const uint32_t attachment_state_total_size =
2250          attachment_state_item_size * state->attachment_alloc_count;
2251       if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
2252          if (state->meta.attachment_alloc_count > 0)
2253             vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
2254 
2255          state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
2256                                              attachment_state_total_size, 8,
2257                                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2258          if (!state->meta.attachments) {
2259             v3dv_flag_oom(cmd_buffer, NULL);
2260             return;
2261          }
2262          state->meta.attachment_alloc_count = state->attachment_alloc_count;
2263       }
2264       state->meta.attachment_count = state->attachment_alloc_count;
2265       memcpy(state->meta.attachments, state->attachments,
2266              attachment_state_total_size);
2267 
2268       state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
2269       memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
2270    }
2271 
2272    /* We expect that meta operations are graphics-only, so we only take into
2273     * account the graphics pipeline, and the graphics state
2274     */
2275    state->meta.gfx.pipeline = state->gfx.pipeline;
2276    memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
2277 
2278    struct v3dv_descriptor_state *gfx_descriptor_state =
2279       &cmd_buffer->state.gfx.descriptor_state;
2280 
2281    if (push_descriptor_state) {
2282       if (gfx_descriptor_state->valid != 0) {
2283          memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state,
2284                 sizeof(state->gfx.descriptor_state));
2285       }
2286       state->meta.has_descriptor_state = true;
2287    } else {
2288       state->meta.has_descriptor_state = false;
2289    }
2290 
2291    /* FIXME: if we keep track of whether we have bound any push constant state
2292     *        at all we could restrict this only to cases where it is actually
2293     *        necessary.
2294     */
2295    memcpy(state->meta.push_constants, cmd_buffer->push_constants_data,
2296           sizeof(state->meta.push_constants));
2297 }
2298 
2299 /* This restores command buffer state after a meta operation
2300  */
2301 void
2302 v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
2303                                uint32_t dirty_dynamic_state,
2304                                bool needs_subpass_resume)
2305 {
2306    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2307 
2308    if (state->meta.subpass_idx != -1) {
2309       state->pass = v3dv_render_pass_from_handle(state->meta.pass);
2310       state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);
2311 
2312       assert(state->meta.attachment_count <= state->attachment_alloc_count);
2313       const uint32_t attachment_state_item_size =
2314          sizeof(struct v3dv_cmd_buffer_attachment_state);
2315       const uint32_t attachment_state_total_size =
2316          attachment_state_item_size * state->meta.attachment_count;
2317       memcpy(state->attachments, state->meta.attachments,
2318              attachment_state_total_size);
2319 
2320       state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
2321       memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));
2322 
2323       /* If needs_subpass_resume is true it means that we emitted the meta
2324        * operation in its own job (possibly with an RT config that is
2325        * incompatible with the current subpass), so resuming subpass execution
2326        * after it requires that we create a new job with the subpass RT setup.
2327        */
2328       if (needs_subpass_resume)
2329          v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
2330    } else {
2331       state->subpass_idx = -1;
2332    }
2333 
2334    if (state->meta.gfx.pipeline != NULL) {
2335       struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
2336       VkPipelineBindPoint pipeline_binding =
2337          v3dv_pipeline_get_binding_point(pipeline);
2338       v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
2339                            pipeline_binding,
2340                            v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
2341    } else {
2342       state->gfx.pipeline = NULL;
2343    }
2344 
2345    if (dirty_dynamic_state) {
2346       memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
2347       state->dirty |= dirty_dynamic_state;
2348    }
2349 
2350    if (state->meta.has_descriptor_state) {
2351       if (state->meta.gfx.descriptor_state.valid != 0) {
2352          memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
2353                 sizeof(state->gfx.descriptor_state));
2354       } else {
2355          state->gfx.descriptor_state.valid = 0;
2356       }
2357    }
2358 
2359    memcpy(cmd_buffer->push_constants_data, state->meta.push_constants,
2360           sizeof(state->meta.push_constants));
2361 
2362    state->meta.gfx.pipeline = NULL;
2363    state->meta.framebuffer = VK_NULL_HANDLE;
2364    state->meta.pass = VK_NULL_HANDLE;
2365    state->meta.subpass_idx = -1;
2366    state->meta.has_descriptor_state = false;
2367 }
2368 
2369 static struct v3dv_job *
2370 cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
2371 {
2372    struct v3dv_job *job = cmd_buffer->state.job;
2373    assert(job);
2374 
2375    /* If the job has been flagged with 'always_flush' and it has already
2376     * recorded any draw calls then we need to start a new job for it.
2377     */
2378    if (job->always_flush && job->draw_count > 0) {
2379       assert(cmd_buffer->state.pass);
2380       /* First, flag the current job as not being the last in the
2381        * current subpass
2382        */
2383       job->is_subpass_finish = false;
2384 
2385       /* Now start a new job in the same subpass and flag it as continuing
2386        * the current subpass.
2387        */
2388       job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2389                                            cmd_buffer->state.subpass_idx);
2390       assert(job->draw_count == 0);
2391 
2392       /* Inherit the 'always flush' behavior */
2393       job->always_flush = true;
2394    }
2395 
2396    assert(job->draw_count == 0 || !job->always_flush);
2397    return job;
2398 }
2399 
2400 /**
2401  * The Vulkan spec states:
2402  *
2403  *   "It is legal for a subpass to use no color or depth/stencil
2404  *    attachments (...)  This kind of subpass can use shader side effects such
2405  *    as image stores and atomics to produce an output. In this case, the
2406  *    subpass continues to use the width, height, and layers of the framebuffer
2407  *    to define the dimensions of the rendering area, and the
2408  *    rasterizationSamples from each pipeline’s
2409  *    VkPipelineMultisampleStateCreateInfo to define the number of samples used
2410  *    in rasterization."
2411  *
2412  * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
2413  * emit when we start a new frame at the beginning of a subpass. At that point,
2414  * if the framebuffer doesn't have any attachments we won't enable MSAA and
2415  * the job won't be valid in the scenario described by the spec.
2416  *
2417  * This function is intended to be called before a draw call and will test if
2418  * we are in that scenario, in which case, it will restart the current job
2419  * with MSAA enabled.
2420  */
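/* Example scenario (hypothetical): a subpass with no attachments and a bound
 * pipeline created with rasterizationSamples = VK_SAMPLE_COUNT_4_BIT. Since
 * there were no attachments to infer MSAA from, the frame was started without
 * it, so on the first draw we drop that job and start a new frame with MSAA
 * enabled.
 */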
2421 static void
2422 cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
2423 {
2424    assert(cmd_buffer->state.job);
2425 
2426    /* We don't support variableMultisampleRate so we know that all pipelines
2427     * bound in the same subpass must have matching number of samples, so we
2428     * can do this check only on the first draw call.
2429     */
2430    if (cmd_buffer->state.job->draw_count > 0)
2431       return;
2432 
2433    /* We only need to restart the frame if the pipeline requires MSAA but
2434     * our frame tiling didn't enable it.
2435     */
2436    if (!cmd_buffer->state.gfx.pipeline->msaa ||
2437        cmd_buffer->state.job->frame_tiling.msaa) {
2438       return;
2439    }
2440 
2441    /* FIXME: Secondary command buffers don't start frames. Instead, they are
2442     * recorded into primary jobs that start them. For secondaries, we should
2443     * still handle this scenario, but we should do that when we record them
2444     * into primaries by testing if any of the secondaries has multisampled
2445     * draw calls in them, and then using that info to decide if we need to
2446     * restart the primary job into which they are being recorded.
2447     */
2448    if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2449       return;
2450 
2451    /* Drop the current job and restart it with MSAA enabled */
2452    struct v3dv_job *old_job = cmd_buffer->state.job;
2453    cmd_buffer->state.job = NULL;
2454 
2455    struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
2456                                     sizeof(struct v3dv_job), 8,
2457                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2458    if (!job) {
2459       v3dv_flag_oom(cmd_buffer, NULL);
2460       return;
2461    }
2462 
2463    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,
2464                  cmd_buffer->state.subpass_idx);
2465    cmd_buffer->state.job = job;
2466 
2467    v3dv_job_start_frame(job,
2468                         old_job->frame_tiling.width,
2469                         old_job->frame_tiling.height,
2470                         old_job->frame_tiling.layers,
2471                         true,
2472                         old_job->frame_tiling.render_target_count,
2473                         old_job->frame_tiling.internal_bpp,
2474                         true /* msaa */);
2475 
2476    v3dv_job_destroy(old_job);
2477 }
2478 
2479 void
2480 v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
2481 {
2482    assert(cmd_buffer->state.gfx.pipeline);
2483    assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2484 
2485    /* If we emitted a pipeline barrier right before this draw we won't have
2486     * an active job. In that case, create a new job continuing the current
2487     * subpass.
2488     */
2489    if (!cmd_buffer->state.job) {
2490       v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2491                                      cmd_buffer->state.subpass_idx);
2492    }
2493 
2494    /* Restart single sample job for MSAA pipeline if needed */
2495    cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
2496 
2497    /* If the job is configured to flush on every draw call we need to create
2498     * a new job now.
2499     */
2500    struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
2501    job->draw_count++;
2502 
2503    /* GL shader state binds shaders, uniform and vertex attribute state. The
2504     * compiler injects uniforms to handle some descriptor types (such as
2505     * textures), so we need to regen that when descriptor state changes.
2506     *
2507     * We also need to emit new shader state if we have a dirty viewport since
2508     * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*.
2509     */
2510    uint32_t *dirty = &cmd_buffer->state.dirty;
2511 
2512    const uint32_t dirty_uniform_state =
2513       *dirty & (V3DV_CMD_DIRTY_PIPELINE |
2514                 V3DV_CMD_DIRTY_PUSH_CONSTANTS |
2515                 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2516                 V3DV_CMD_DIRTY_VIEWPORT |
2517                 V3DV_CMD_DIRTY_VIEW_INDEX);
2518 
2519    if (dirty_uniform_state)
2520       update_gfx_uniform_state(cmd_buffer, dirty_uniform_state);
2521 
2522    struct v3dv_device *device = cmd_buffer->device;
2523 
2524    if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
2525       v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer);
2526 
2527    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
2528       v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer);
2529       v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer);
2530    }
2531 
2532    if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) {
2533       emit_scissor(cmd_buffer);
2534    }
2535 
2536    if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) {
2537       v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer);
2538    }
2539 
2540    if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
2541       v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer);
2542 
2543    const uint32_t dynamic_stencil_dirty_flags =
2544       V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
2545       V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
2546       V3DV_CMD_DIRTY_STENCIL_REFERENCE;
2547    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags))
2548       v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer);
2549 
2550    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
2551       v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
2552 
2553    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
2554       v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
2555 
2556    if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
2557       v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer);
2558 
2559    if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH)
2560       v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer);
2561 
2562    if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
2563       v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer);
2564 
2565    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE))
2566       v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer);
2567 
2568    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
2569 }
2570 
2571 static inline void
2572 cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer,
2573                           uint32_t view_index)
2574 {
2575    cmd_buffer->state.view_index = view_index;
2576    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
2577 }
2578 
2579 static void
2580 cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
2581                 struct v3dv_draw_info *info)
2582 {
2583 
2584    struct v3dv_render_pass *pass = cmd_buffer->state.pass;
2585    if (likely(!pass->multiview_enabled)) {
2586       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2587       v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
2588       return;
2589    }
2590 
2591    uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
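   /* Example (hypothetical): view_mask = 0b0110 emits the draw twice, first
    * with view index 1 and then with view index 2; shaders can then use the
    * view index to select per-view data.
    */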
2592    while (view_mask) {
2593       cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
2594       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2595       v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
2596    }
2597 }
2598 
2599 VKAPI_ATTR void VKAPI_CALL
2600 v3dv_CmdDraw(VkCommandBuffer commandBuffer,
2601              uint32_t vertexCount,
2602              uint32_t instanceCount,
2603              uint32_t firstVertex,
2604              uint32_t firstInstance)
2605 {
2606    if (vertexCount == 0 || instanceCount == 0)
2607       return;
2608 
2609    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2610    struct v3dv_draw_info info = {};
2611    info.vertex_count = vertexCount;
2612    info.instance_count = instanceCount;
2613    info.first_instance = firstInstance;
2614    info.first_vertex = firstVertex;
2615 
2616    cmd_buffer_draw(cmd_buffer, &info);
2617 }
2618 
2619 VKAPI_ATTR void VKAPI_CALL
2620 v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
2621                     uint32_t indexCount,
2622                     uint32_t instanceCount,
2623                     uint32_t firstIndex,
2624                     int32_t vertexOffset,
2625                     uint32_t firstInstance)
2626 {
2627    if (indexCount == 0 || instanceCount == 0)
2628       return;
2629 
2630    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2631 
2632    struct v3dv_render_pass *pass = cmd_buffer->state.pass;
2633    if (likely(!pass->multiview_enabled)) {
2634       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2635       v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
2636          (cmd_buffer, indexCount, instanceCount,
2637           firstIndex, vertexOffset, firstInstance);
2638       return;
2639    }
2640 
2641    uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
2642    while (view_mask) {
2643       cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
2644       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2645       v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
2646          (cmd_buffer, indexCount, instanceCount,
2647           firstIndex, vertexOffset, firstInstance);
2648    }
2649 }
2650 
2651 VKAPI_ATTR void VKAPI_CALL
2652 v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
2653                      VkBuffer _buffer,
2654                      VkDeviceSize offset,
2655                      uint32_t drawCount,
2656                      uint32_t stride)
2657 {
2658    /* drawCount is the number of draws to execute, and can be zero. */
2659    if (drawCount == 0)
2660       return;
2661 
2662    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2663    V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
2664 
2665    struct v3dv_render_pass *pass = cmd_buffer->state.pass;
2666    if (likely(!pass->multiview_enabled)) {
2667       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2668       v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
2669          (cmd_buffer, buffer, offset, drawCount, stride);
2670       return;
2671    }
2672 
2673    uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
2674    while (view_mask) {
2675       cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
2676       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2677       v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
2678          (cmd_buffer, buffer, offset, drawCount, stride);
2679    }
2680 }
2681 
2682 VKAPI_ATTR void VKAPI_CALL
2683 v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
2684                             VkBuffer _buffer,
2685                             VkDeviceSize offset,
2686                             uint32_t drawCount,
2687                             uint32_t stride)
2688 {
2689    /* drawCount is the number of draws to execute, and can be zero. */
2690    if (drawCount == 0)
2691       return;
2692 
2693    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2694    V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
2695 
2696    struct v3dv_render_pass *pass = cmd_buffer->state.pass;
2697    if (likely(!pass->multiview_enabled)) {
2698       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2699       v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
2700          (cmd_buffer, buffer, offset, drawCount, stride);
2701       return;
2702    }
2703 
2704    uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
2705    while (view_mask) {
2706       cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
2707       v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
2708       v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
2709          (cmd_buffer, buffer, offset, drawCount, stride);
2710    }
2711 }
2712 
2713 VKAPI_ATTR void VKAPI_CALL
2714 v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
2715                         VkPipelineStageFlags srcStageMask,
2716                         VkPipelineStageFlags dstStageMask,
2717                         VkDependencyFlags dependencyFlags,
2718                         uint32_t memoryBarrierCount,
2719                         const VkMemoryBarrier *pMemoryBarriers,
2720                         uint32_t bufferBarrierCount,
2721                         const VkBufferMemoryBarrier *pBufferBarriers,
2722                         uint32_t imageBarrierCount,
2723                         const VkImageMemoryBarrier *pImageBarriers)
2724 {
2725    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2726 
2727    /* We only care about barriers between GPU jobs */
2728    if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT ||
2729        dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) {
2730       return;
2731    }
2732 
2733    /* If there is a job currently being recorded, finish it here */
2734    struct v3dv_job *job = cmd_buffer->state.job;
2735    if (job)
2736       v3dv_cmd_buffer_finish_job(cmd_buffer);
2737 
2738    cmd_buffer->state.has_barrier = true;
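   /* If the destination stages consume vertex, geometry or indirect draw
    * data, the barrier also needs to be observed by the binning (BCL) stage
    * of later jobs, not just their rendering (RCL) stage, so flag that too.
    */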
2739    if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
2740                        VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
2741                        VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
2742                        VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
2743                        VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
2744                        VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) {
2745       cmd_buffer->state.has_bcl_barrier = true;
2746    }
2747 }
2748 
2749 VKAPI_ATTR void VKAPI_CALL
2750 v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2751                           uint32_t firstBinding,
2752                           uint32_t bindingCount,
2753                           const VkBuffer *pBuffers,
2754                           const VkDeviceSize *pOffsets)
2755 {
2756    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2757    struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
2758 
2759    /* We have to defer setting up the vertex buffers until draw time, since
2760     * we need the buffer stride from the currently bound pipeline.
2761     */
2762 
2763    assert(firstBinding + bindingCount <= MAX_VBS);
2764    bool vb_state_changed = false;
2765    for (uint32_t i = 0; i < bindingCount; i++) {
2766       if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) {
2767          vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);
2768          vb_state_changed = true;
2769       }
2770       if (vb[firstBinding + i].offset != pOffsets[i]) {
2771          vb[firstBinding + i].offset = pOffsets[i];
2772          vb_state_changed = true;
2773       }
2774    }
2775 
2776    if (vb_state_changed)
2777       cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
2778 }
2779 
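/* Returns the index size in bytes for a VkIndexType. 8-bit indices are
 * exposed through VK_EXT_index_type_uint8.
 */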
2780 static uint32_t
2781 get_index_size(VkIndexType index_type)
2782 {
2783    switch (index_type) {
2784    case VK_INDEX_TYPE_UINT8_EXT:
2785       return 1;
2787    case VK_INDEX_TYPE_UINT16:
2788       return 2;
2790    case VK_INDEX_TYPE_UINT32:
2791       return 4;
2793    default:
2794       unreachable("Unsupported index type");
2795    }
2796 }
2797 
2798 VKAPI_ATTR void VKAPI_CALL
2799 v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2800                         VkBuffer buffer,
2801                         VkDeviceSize offset,
2802                         VkIndexType indexType)
2803 {
2804    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2805 
2806    const uint32_t index_size = get_index_size(indexType);
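   /* Skip the update (and the dirty flag) if the bound index buffer state
    * hasn't actually changed.
    */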
2807    if (buffer == cmd_buffer->state.index_buffer.buffer &&
2808        offset == cmd_buffer->state.index_buffer.offset &&
2809        index_size == cmd_buffer->state.index_buffer.index_size) {
2810       return;
2811    }
2812 
2813    cmd_buffer->state.index_buffer.buffer = buffer;
2814    cmd_buffer->state.index_buffer.offset = offset;
2815    cmd_buffer->state.index_buffer.index_size = index_size;
2816    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER;
2817 }
2818 
2819 VKAPI_ATTR void VKAPI_CALL
2820 v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2821                               VkStencilFaceFlags faceMask,
2822                               uint32_t compareMask)
2823 {
2824    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2825 
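   /* Stencil values are 8-bit, so only the low byte of the mask is relevant. */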
2826    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2827       cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff;
2828    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2829       cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff;
2830 
2831    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
2832 }
2833 
2834 VKAPI_ATTR void VKAPI_CALL
2835 v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2836                             VkStencilFaceFlags faceMask,
2837                             uint32_t writeMask)
2838 {
2839    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2840 
2841    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2842       cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff;
2843    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2844       cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff;
2845 
2846    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
2847 }
2848 
2849 VKAPI_ATTR void VKAPI_CALL
2850 v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2851                             VkStencilFaceFlags faceMask,
2852                             uint32_t reference)
2853 {
2854    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2855 
2856    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2857       cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff;
2858    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2859       cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff;
2860 
2861    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
2862 }
2863 
2864 VKAPI_ATTR void VKAPI_CALL
2865 v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2866                      float depthBiasConstantFactor,
2867                      float depthBiasClamp,
2868                      float depthBiasSlopeFactor)
2869 {
2870    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2871 
2872    cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;
2873    cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp;
2874    cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor;
2875    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
2876 }
2877 
2878 VKAPI_ATTR void VKAPI_CALL
2879 v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2880                        float minDepthBounds,
2881                        float maxDepthBounds)
2882 {
2883    /* We do not support depth bounds testing, so we just ignore this here.
2884     * We already assert that pipelines don't enable the feature anyway.
2885     */
2886 }
2887 
2888 VKAPI_ATTR void VKAPI_CALL
2889 v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
2890                      float lineWidth)
2891 {
2892    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2893 
2894    cmd_buffer->state.dynamic.line_width = lineWidth;
2895    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
2896 }
2897 
2898 VKAPI_ATTR void VKAPI_CALL
2899 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2900                            VkPipelineBindPoint pipelineBindPoint,
2901                            VkPipelineLayout _layout,
2902                            uint32_t firstSet,
2903                            uint32_t descriptorSetCount,
2904                            const VkDescriptorSet *pDescriptorSets,
2905                            uint32_t dynamicOffsetCount,
2906                            const uint32_t *pDynamicOffsets)
2907 {
2908    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2909    V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);
2910 
2911    uint32_t dyn_index = 0;
2912 
2913    assert(firstSet + descriptorSetCount <= MAX_SETS);
2914 
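   /* Compute and graphics descriptor state are tracked separately, since they
    * are bound at different pipeline bind points and consumed by different
    * job types.
    */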
2915    struct v3dv_descriptor_state *descriptor_state =
2916       pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?
2917       &cmd_buffer->state.compute.descriptor_state :
2918       &cmd_buffer->state.gfx.descriptor_state;
2919 
2920    VkShaderStageFlags dirty_stages = 0;
2921    bool descriptor_state_changed = false;
2922    for (uint32_t i = 0; i < descriptorSetCount; i++) {
2923       V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
2924       uint32_t index = firstSet + i;
2925 
2926       descriptor_state->valid |= (1u << index);
2927       if (descriptor_state->descriptor_sets[index] != set) {
2928          descriptor_state->descriptor_sets[index] = set;
2929          dirty_stages |= set->layout->shader_stages;
2930          descriptor_state_changed = true;
2931       }
2932 
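      /* Dynamic buffer offsets are stored flattened per pipeline layout, so we
       * index them using the set's dynamic_offset_start within the layout.
       */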
2933       for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
2934          uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start;
2935 
2936          if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
2937             descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
2938             dirty_stages |= set->layout->shader_stages;
2939             descriptor_state_changed = true;
2940          }
2941       }
2942    }
2943 
2944    if (descriptor_state_changed) {
2945       if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2946          cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
2947          cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
2948       } else {
2949          cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
2950          cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
2951       }
2952    }
2953 }
2954 
2955 VKAPI_ATTR void VKAPI_CALL
2956 v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
2957                       VkPipelineLayout layout,
2958                       VkShaderStageFlags stageFlags,
2959                       uint32_t offset,
2960                       uint32_t size,
2961                       const void *pValues)
2962 {
2963    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2964 
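   /* Skip the copy and the dirty flagging if the incoming data matches what
    * we have already recorded.
    */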
2965    if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size))
2966       return;
2967 
2968    memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size);
2969 
2970    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS;
2971    cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
2972 }
2973 
2974 VKAPI_ATTR void VKAPI_CALL
2975 v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2976                           const float blendConstants[4])
2977 {
2978    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2979    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2980 
2981    if (!memcmp(state->dynamic.blend_constants, blendConstants,
2982                sizeof(state->dynamic.blend_constants))) {
2983       return;
2984    }
2985 
2986    memcpy(state->dynamic.blend_constants, blendConstants,
2987           sizeof(state->dynamic.blend_constants));
2988 
2989    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
2990 }
2991 
2992 VKAPI_ATTR void VKAPI_CALL
2993 v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
2994                                uint32_t attachmentCount,
2995                                const VkBool32 *pColorWriteEnables)
2996 {
2997    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2998    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2999    uint32_t color_write_enable = 0;
3000 
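   /* Each attachment contributes a 4-bit nibble (one bit per RGBA channel) to
    * the packed mask, e.g. { VK_TRUE, VK_FALSE } produces 0x0f.
    */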
3001    for (uint32_t i = 0; i < attachmentCount; i++)
3002       color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
3003 
3004    if (state->dynamic.color_write_enable == color_write_enable)
3005       return;
3006 
3007    state->dynamic.color_write_enable = color_write_enable;
3008 
3009    state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
3010 }
3011 
3012 void
3013 v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
3014                               struct v3dv_query_pool *pool,
3015                               uint32_t first,
3016                               uint32_t count)
3017 {
3018    /* Resets can only happen outside a render pass instance so we should not
3019     * be in the middle of job recording.
3020     */
3021    assert(cmd_buffer->state.pass == NULL);
3022    assert(cmd_buffer->state.job == NULL);
3023 
3024    assert(first < pool->query_count);
3025    assert(first + count <= pool->query_count);
3026 
3027    struct v3dv_job *job =
3028       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3029                                      V3DV_JOB_TYPE_CPU_RESET_QUERIES,
3030                                      cmd_buffer, -1);
3031    v3dv_return_if_oom(cmd_buffer, NULL);
3032 
3033    job->cpu.query_reset.pool = pool;
3034    job->cpu.query_reset.first = first;
3035    job->cpu.query_reset.count = count;
3036 
3037    list_addtail(&job->list_link, &cmd_buffer->jobs);
3038 }
3039 
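/* Grows a dynamically sized state array when it is full: the slot count is
 * doubled (with a minimum of 4 slots), the existing entries are copied over,
 * and OOM is flagged on the command buffer if the allocation fails.
 */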
3040 void
3041 v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
3042                                    uint32_t slot_size,
3043                                    uint32_t used_count,
3044                                    uint32_t *alloc_count,
3045                                    void **ptr)
3046 {
3047    if (used_count >= *alloc_count) {
3048       const uint32_t prev_slot_count = *alloc_count;
3049       void *old_buffer = *ptr;
3050 
3051       const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);
3052       const uint32_t bytes = new_slot_count * slot_size;
3053       *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8,
3054                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3055       if (*ptr == NULL) {
3056          fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n");
3057          v3dv_flag_oom(cmd_buffer, NULL);
3058          return;
3059       }
3060 
3061       memcpy(*ptr, old_buffer, prev_slot_count * slot_size);
3062       *alloc_count = new_slot_count;
3063    }
3064    assert(used_count < *alloc_count);
3065 }
3066 
3067 void
3068 v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
3069                             struct v3dv_query_pool *pool,
3070                             uint32_t query,
3071                             VkQueryControlFlags flags)
3072 {
3073    /* FIXME: we only support one active query for now */
3074    assert(cmd_buffer->state.query.active_query.bo == NULL);
3075    assert(query < pool->query_count);
3076 
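   /* We just record which query BO/offset is active here; the occlusion
    * counter address itself is emitted later with the draw state, which is
    * why we flag the dirty bit.
    */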
3077    cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
3078    cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
3079    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3080 }
3081 
3082 void
3083 v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
3084                           struct v3dv_query_pool *pool,
3085                           uint32_t query)
3086 {
3087    assert(query < pool->query_count);
3088    assert(cmd_buffer->state.query.active_query.bo != NULL);
3089 
3090    if (cmd_buffer->state.pass) {
3091       /* Queue the EndQuery in the command buffer state; we will create a CPU
3092        * job to flag all of these queries as possibly available right after
3093        * the render pass job in which they have been recorded.
3094        */
3095       struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3096       v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
3097                                          sizeof(struct v3dv_end_query_cpu_job_info),
3098                                          state->query.end.used_count,
3099                                          &state->query.end.alloc_count,
3100                                          (void **) &state->query.end.states);
3101       v3dv_return_if_oom(cmd_buffer, NULL);
3102 
3103       struct v3dv_end_query_cpu_job_info *info =
3104          &state->query.end.states[state->query.end.used_count++];
3105 
3106       info->pool = pool;
3107       info->query = query;
3108 
3109       /* From the Vulkan spec:
3110        *
3111        *   "If queries are used while executing a render pass instance that has
3112        *    multiview enabled, the query uses N consecutive query indices in
3113        *    the query pool (starting at query) where N is the number of bits set
3114        *    in the view mask in the subpass the query is used in. How the
3115        *    numerical results of the query are distributed among the queries is
3116        *    implementation-dependent."
3117        *
3118        * In our case only the first query is actually used, but this means we
3119        * still need to flag the other queries as available so we don't emit
3120        * errors when the application attempts to retrieve values from them.
3121        */
3122       struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3123       if (!pass->multiview_enabled) {
3124          info->count = 1;
3125       } else {
3126          struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
3127          info->count = util_bitcount(subpass->view_mask);
3128       }
3129    } else {
3130       /* Otherwise, schedule the CPU job immediately */
3131       struct v3dv_job *job =
3132          v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3133                                         V3DV_JOB_TYPE_CPU_END_QUERY,
3134                                         cmd_buffer, -1);
3135       v3dv_return_if_oom(cmd_buffer, NULL);
3136 
3137       job->cpu.query_end.pool = pool;
3138       job->cpu.query_end.query = query;
3139 
3140       /* Multiview queries cannot cross subpass boundaries */
3141       job->cpu.query_end.count = 1;
3142 
3143       list_addtail(&job->list_link, &cmd_buffer->jobs);
3144    }
3145 
3146    cmd_buffer->state.query.active_query.bo = NULL;
3147    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3148 }
3149 
3150 void
3151 v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
3152                                    struct v3dv_query_pool *pool,
3153                                    uint32_t first,
3154                                    uint32_t count,
3155                                    struct v3dv_buffer *dst,
3156                                    uint32_t offset,
3157                                    uint32_t stride,
3158                                    VkQueryResultFlags flags)
3159 {
3160    /* Copies can only happen outside a render pass instance so we should not
3161     * be in the middle of job recording.
3162     */
3163    assert(cmd_buffer->state.pass == NULL);
3164    assert(cmd_buffer->state.job == NULL);
3165 
3166    assert(first < pool->query_count);
3167    assert(first + count <= pool->query_count);
3168 
3169    struct v3dv_job *job =
3170       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3171                                      V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
3172                                      cmd_buffer, -1);
3173    v3dv_return_if_oom(cmd_buffer, NULL);
3174 
3175    job->cpu.query_copy_results.pool = pool;
3176    job->cpu.query_copy_results.first = first;
3177    job->cpu.query_copy_results.count = count;
3178    job->cpu.query_copy_results.dst = dst;
3179    job->cpu.query_copy_results.offset = offset;
3180    job->cpu.query_copy_results.stride = stride;
3181    job->cpu.query_copy_results.flags = flags;
3182 
3183    list_addtail(&job->list_link, &cmd_buffer->jobs);
3184 }
3185 
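/* Wraps a TFU (Texture Formatting Unit) transfer submission into its own job
 * and appends it to the command buffer's job list.
 */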
3186 void
3187 v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
3188                             struct drm_v3d_submit_tfu *tfu)
3189 {
3190    struct v3dv_device *device = cmd_buffer->device;
3191    struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
3192                                     sizeof(struct v3dv_job), 8,
3193                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3194    if (!job) {
3195       v3dv_flag_oom(cmd_buffer, NULL);
3196       return;
3197    }
3198 
3199    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);
3200    job->tfu = *tfu;
3201    list_addtail(&job->list_link, &cmd_buffer->jobs);
3202 }
3203 
3204 VKAPI_ATTR void VKAPI_CALL
3205 v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
3206                  VkEvent _event,
3207                  VkPipelineStageFlags stageMask)
3208 {
3209    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3210    V3DV_FROM_HANDLE(v3dv_event, event, _event);
3211 
3212    /* Event (re)sets can only happen outside a render pass instance so we
3213     * should not be in the middle of job recording.
3214     */
3215    assert(cmd_buffer->state.pass == NULL);
3216    assert(cmd_buffer->state.job == NULL);
3217 
3218    struct v3dv_job *job =
3219       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3220                                      V3DV_JOB_TYPE_CPU_SET_EVENT,
3221                                      cmd_buffer, -1);
3222    v3dv_return_if_oom(cmd_buffer, NULL);
3223 
3224    job->cpu.event_set.event = event;
3225    job->cpu.event_set.state = 1;
3226 
3227    list_addtail(&job->list_link, &cmd_buffer->jobs);
3228 }
3229 
3230 VKAPI_ATTR void VKAPI_CALL
3231 v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
3232                    VkEvent _event,
3233                    VkPipelineStageFlags stageMask)
3234 {
3235    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3236    V3DV_FROM_HANDLE(v3dv_event, event, _event);
3237 
3238    /* Event (re)sets can only happen outside a render pass instance so we
3239     * should not be in the middle of job recording.
3240     */
3241    assert(cmd_buffer->state.pass == NULL);
3242    assert(cmd_buffer->state.job == NULL);
3243 
3244    struct v3dv_job *job =
3245       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3246                                      V3DV_JOB_TYPE_CPU_SET_EVENT,
3247                                      cmd_buffer, -1);
3248    v3dv_return_if_oom(cmd_buffer, NULL);
3249 
3250    job->cpu.event_set.event = event;
3251    job->cpu.event_set.state = 0;
3252 
3253    list_addtail(&job->list_link, &cmd_buffer->jobs);
3254 }
3255 
3256 VKAPI_ATTR void VKAPI_CALL
3257 v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
3258                    uint32_t eventCount,
3259                    const VkEvent *pEvents,
3260                    VkPipelineStageFlags srcStageMask,
3261                    VkPipelineStageFlags dstStageMask,
3262                    uint32_t memoryBarrierCount,
3263                    const VkMemoryBarrier *pMemoryBarriers,
3264                    uint32_t bufferMemoryBarrierCount,
3265                    const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3266                    uint32_t imageMemoryBarrierCount,
3267                    const VkImageMemoryBarrier *pImageMemoryBarriers)
3268 {
3269    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3270 
3271    assert(eventCount > 0);
3272 
3273    struct v3dv_job *job =
3274       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3275                                      V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
3276                                      cmd_buffer, -1);
3277    v3dv_return_if_oom(cmd_buffer, NULL);
3278 
3279    const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;
3280 
3281    job->cpu.event_wait.events =
3282       vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8,
3283                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3284    if (!job->cpu.event_wait.events) {
3285       v3dv_flag_oom(cmd_buffer, NULL);
3286       return;
3287    }
3288    job->cpu.event_wait.event_count = eventCount;
3289 
3290    for (uint32_t i = 0; i < eventCount; i++)
3291       job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);
3292 
3293    /* vkCmdWaitEvents can be recorded inside a render pass, so we might have
3294     * an active job.
3295     *
3296     * If we are inside a render pass, then because vkCmd(Re)SetEvent can't
3297     * happen inside a render pass, it is safe to move the wait job so that it
3298     * runs right before the job we are currently recording for the subpass, if any
3299     * (it would actually be safe to move it all the way back to right before
3300     * the start of the render pass).
3301     *
3302     * If we are outside a render pass then we should not have any on-going job
3303     * and we are free to just add the wait job without restrictions.
3304     */
3305    assert(cmd_buffer->state.pass || !cmd_buffer->state.job);
3306    list_addtail(&job->list_link, &cmd_buffer->jobs);
3307 }
3308 
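/* Timestamp queries are implemented with a CPU job that is executed when the
 * queue processes the submission, so if we are inside a render pass we need
 * to split the current job around the timestamp write.
 */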
3309 VKAPI_ATTR void VKAPI_CALL
3310 v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
3311                        VkPipelineStageFlagBits pipelineStage,
3312                        VkQueryPool queryPool,
3313                        uint32_t query)
3314 {
3315    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3316    V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
3317 
3318    /* If this is called inside a render pass we need to finish the current
3319     * job here...
3320     */
3321    struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3322    if (pass)
3323       v3dv_cmd_buffer_finish_job(cmd_buffer);
3324 
3325    struct v3dv_job *job =
3326       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3327                                      V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
3328                                      cmd_buffer, -1);
3329    v3dv_return_if_oom(cmd_buffer, NULL);
3330 
3331    job->cpu.query_timestamp.pool = query_pool;
3332    job->cpu.query_timestamp.query = query;
3333 
3334    if (!pass || !pass->multiview_enabled) {
3335       job->cpu.query_timestamp.count = 1;
3336    } else {
3337       struct v3dv_subpass *subpass =
3338          &pass->subpasses[cmd_buffer->state.subpass_idx];
3339       job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask);
3340    }
3341 
3342    list_addtail(&job->list_link, &cmd_buffer->jobs);
3343    cmd_buffer->state.job = NULL;
3344 
3345    /* ...and resume the subpass after the timestamp */
3346    if (cmd_buffer->state.pass)
3347       v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
3348 }
3349 
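/* Clears compute-related dirty state before recording a dispatch: the CSD
 * job we are about to create consumes the current pipeline, descriptor and
 * push constant state directly, so we can simply clear the dirty bits here.
 */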
3350 static void
3351 cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
3352 {
3353    assert(cmd_buffer->state.compute.pipeline);
3354    assert(cmd_buffer->state.compute.pipeline->active_stages ==
3355           VK_SHADER_STAGE_COMPUTE_BIT);
3356 
3357    cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |
3358                                 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
3359    cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
3360    cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
3361 }
3362 
3363 #define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
3364 #define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
3365 /* Allow this dispatch to start while the last one is still running. */
3366 #define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
3367 /* Maximum supergroup ID.  6 bits. */
3368 #define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
3369 /* Batches per supergroup minus 1.  8 bits. */
3370 #define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
3371 /* Workgroups per supergroup, 0 means 16 */
3372 #define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
3373 #define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
3374 
3375 #define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
3376 #define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
3377 #define V3D_CSD_CFG5_THREADING (1 << 0)
3378 
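/* Called at submit time when processing an indirect dispatch CPU job: patches
 * the pre-recorded CSD job with the workgroup counts read from the indirect
 * buffer, recomputing the batch count in cfg[4] and, if the shader reads the
 * workgroup count, rewriting the matching uniforms in the job's indirect CL.
 */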
3379 void
3380 v3dv_cmd_buffer_rewrite_indirect_csd_job(
3381    struct v3dv_csd_indirect_cpu_job_info *info,
3382    const uint32_t *wg_counts)
3383 {
3384    assert(info->csd_job);
3385    struct v3dv_job *job = info->csd_job;
3386 
3387    assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
3388    assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
3389 
3390    struct drm_v3d_submit_csd *submit = &job->csd.submit;
3391 
3392    job->csd.wg_count[0] = wg_counts[0];
3393    job->csd.wg_count[1] = wg_counts[1];
3394    job->csd.wg_count[2] = wg_counts[2];
3395 
3396    submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
3397    submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
3398    submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
3399 
3400    submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
3401                     (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
3402    assert(submit->cfg[4] != ~0);
3403 
3404    if (info->needs_wg_uniform_rewrite) {
3405       /* Make sure the GPU is not currently accessing the indirect CL for this
3406        * job, since we are about to overwrite some of the uniform data.
3407        */
3408       v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE);
3409 
3410       for (uint32_t i = 0; i < 3; i++) {
3411          if (info->wg_uniform_offsets[i]) {
3412             /* Sanity check that our uniform pointers are within the allocated
3413              * BO space for our indirect CL.
3414              */
3415             assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
3416             assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
3417             *(info->wg_uniform_offsets[i]) = wg_counts[i];
3418          }
3419       }
3420    }
3421 }
3422 
3423 static struct v3dv_job *
3424 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
3425                           uint32_t base_offset_x,
3426                           uint32_t base_offset_y,
3427                           uint32_t base_offset_z,
3428                           uint32_t group_count_x,
3429                           uint32_t group_count_y,
3430                           uint32_t group_count_z,
3431                           uint32_t **wg_uniform_offsets_out,
3432                           uint32_t *wg_size_out)
3433 {
3434    struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
3435    assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
3436    struct v3dv_shader_variant *cs_variant =
3437       pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];
3438 
3439    struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
3440                                     sizeof(struct v3dv_job), 8,
3441                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3442    if (!job) {
3443       v3dv_flag_oom(cmd_buffer, NULL);
3444       return NULL;
3445    }
3446 
3447    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
3448    cmd_buffer->state.job = job;
3449 
3450    struct drm_v3d_submit_csd *submit = &job->csd.submit;
3451 
3452    job->csd.wg_count[0] = group_count_x;
3453    job->csd.wg_count[1] = group_count_y;
3454    job->csd.wg_count[2] = group_count_z;
3455 
3456    job->csd.wg_base[0] = base_offset_x;
3457    job->csd.wg_base[1] = base_offset_y;
3458    job->csd.wg_base[2] = base_offset_z;
3459 
3460    submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
3461    submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
3462    submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
3463 
3464    const struct v3d_compute_prog_data *cpd =
3465       cs_variant->prog_data.cs;
3466 
3467    const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;
3468    const uint32_t wg_size = cpd->local_size[0] *
3469                             cpd->local_size[1] *
3470                             cpd->local_size[2];
3471 
3472    uint32_t wgs_per_sg =
3473       v3d_csd_choose_workgroups_per_supergroup(
3474          &cmd_buffer->device->devinfo,
3475          cs_variant->prog_data.cs->has_subgroups,
3476          cs_variant->prog_data.cs->base.has_control_barrier,
3477          cs_variant->prog_data.cs->base.threads,
3478          num_wgs, wg_size);
3479 
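
   /* The CSD front-end launches work in batches of 16 invocations (hence the
    * divisions by 16 below). For example, wg_size = 64 with wgs_per_sg = 2
    * gives batches_per_sg = DIV_ROUND_UP(2 * 64, 16) = 8.
    */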
3480    uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
3481    uint32_t whole_sgs = num_wgs / wgs_per_sg;
3482    uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
3483    uint32_t num_batches = batches_per_sg * whole_sgs +
3484                           DIV_ROUND_UP(rem_wgs * wg_size, 16);
3485 
3486    submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
3487    submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;
3488    submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
3489    if (wg_size_out)
3490       *wg_size_out = wg_size;
3491 
3492    submit->cfg[4] = num_batches - 1;
3493    assert(submit->cfg[4] != ~0);
3494 
3495    assert(pipeline->shared_data->assembly_bo);
3496    struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
3497 
3498    submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
3499    submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
3500    if (cs_variant->prog_data.base->single_seg)
3501       submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
3502    if (cs_variant->prog_data.base->threads == 4)
3503       submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
3504 
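   /* If the shader uses shared variables, allocate a BO big enough to hold
    * the shared storage for all the workgroups in a supergroup.
    */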
3505    if (cs_variant->prog_data.cs->shared_size > 0) {
3506       job->csd.shared_memory =
3507          v3dv_bo_alloc(cmd_buffer->device,
3508                        cs_variant->prog_data.cs->shared_size * wgs_per_sg,
3509                        "shared_vars", true);
3510       if (!job->csd.shared_memory) {
3511          v3dv_flag_oom(cmd_buffer, NULL);
3512          return job;
3513       }
3514    }
3515 
3516    v3dv_job_add_bo_unchecked(job, cs_assembly_bo);
3517    struct v3dv_cl_reloc uniforms =
3518       v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,
3519                                      cs_variant,
3520                                      wg_uniform_offsets_out);
3521    submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
3522 
3523    v3dv_job_add_bo(job, uniforms.bo);
3524 
3525    return job;
3526 }
3527 
3528 static void
3529 cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
3530                     uint32_t base_offset_x,
3531                     uint32_t base_offset_y,
3532                     uint32_t base_offset_z,
3533                     uint32_t group_count_x,
3534                     uint32_t group_count_y,
3535                     uint32_t group_count_z)
3536 {
3537    if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
3538       return;
3539 
3540    struct v3dv_job *job =
3541       cmd_buffer_create_csd_job(cmd_buffer,
3542                                 base_offset_x,
3543                                 base_offset_y,
3544                                 base_offset_z,
3545                                 group_count_x,
3546                                 group_count_y,
3547                                 group_count_z,
3548                                 NULL, NULL);
3549 
3550    list_addtail(&job->list_link, &cmd_buffer->jobs);
3551    cmd_buffer->state.job = NULL;
3552 }
3553 
3554 VKAPI_ATTR void VKAPI_CALL
3555 v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
3556                  uint32_t groupCountX,
3557                  uint32_t groupCountY,
3558                  uint32_t groupCountZ)
3559 {
3560    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3561 
3562    cmd_buffer_emit_pre_dispatch(cmd_buffer);
3563    cmd_buffer_dispatch(cmd_buffer, 0, 0, 0,
3564                        groupCountX, groupCountY, groupCountZ);
3565 }
3566 
3567 VKAPI_ATTR void VKAPI_CALL
3568 v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
3569                      uint32_t baseGroupX,
3570                      uint32_t baseGroupY,
3571                      uint32_t baseGroupZ,
3572                      uint32_t groupCountX,
3573                      uint32_t groupCountY,
3574                      uint32_t groupCountZ)
3575 {
3576    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3577 
3578    cmd_buffer_emit_pre_dispatch(cmd_buffer);
3579    cmd_buffer_dispatch(cmd_buffer,
3580                        baseGroupX, baseGroupY, baseGroupZ,
3581                        groupCountX, groupCountY, groupCountZ);
3582 }
3583 
3584 
3585 static void
3586 cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
3587                              struct v3dv_buffer *buffer,
3588                              uint32_t offset)
3589 {
3590    /* We can't do indirect dispatches, so instead we record a CPU job that,
3591     * when executed in the queue, will map the indirect buffer, read the
3592     * dispatch parameters, and submit a regular dispatch.
3593     */
3594    struct v3dv_job *job =
3595       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3596                                      V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
3597                                      cmd_buffer, -1);
3598    v3dv_return_if_oom(cmd_buffer, NULL);
3599 
3600    /* We need to create a CSD job now, even if we still don't know the actual
3601     * dispatch parameters, because the job setup needs to be done using the
3602     * current command buffer state (i.e. pipeline, descriptor sets, push
3603     * constants, etc.). So we create the job with default dispatch parameters
3604     * and we will rewrite the parts we need at submit time if the indirect
3605     * parameters don't match the ones we used to set up the job.
3606     */
3607    struct v3dv_job *csd_job =
3608       cmd_buffer_create_csd_job(cmd_buffer,
3609                                 0, 0, 0,
3610                                 1, 1, 1,
3611                                 &job->cpu.csd_indirect.wg_uniform_offsets[0],
3612                                 &job->cpu.csd_indirect.wg_size);
3613    v3dv_return_if_oom(cmd_buffer, NULL);
3614    assert(csd_job);
3615 
3616    job->cpu.csd_indirect.buffer = buffer;
3617    job->cpu.csd_indirect.offset = offset;
3618    job->cpu.csd_indirect.csd_job = csd_job;
3619 
3620    /* If the compute shader reads the workgroup sizes we will also need to
3621     * rewrite the corresponding uniforms.
3622     */
3623    job->cpu.csd_indirect.needs_wg_uniform_rewrite =
3624       job->cpu.csd_indirect.wg_uniform_offsets[0] ||
3625       job->cpu.csd_indirect.wg_uniform_offsets[1] ||
3626       job->cpu.csd_indirect.wg_uniform_offsets[2];
3627 
3628    list_addtail(&job->list_link, &cmd_buffer->jobs);
3629    cmd_buffer->state.job = NULL;
3630 }
3631 
3632 VKAPI_ATTR void VKAPI_CALL
3633 v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3634                          VkBuffer _buffer,
3635                          VkDeviceSize offset)
3636 {
3637    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3638    V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3639 
3640    assert(offset <= UINT32_MAX);
3641 
3642    cmd_buffer_emit_pre_dispatch(cmd_buffer);
3643    cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
3644 }
3645 
3646 VKAPI_ATTR void VKAPI_CALL
3647 v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
3648 {
3649    /* Nothing to do here since we only support a single device */
3650    assert(deviceMask == 0x1);
3651 }
3652