1 /*
2  * Copyright © 2019 Raspberry Pi
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 #include "v3dv_meta_common.h"
26 
27 #include "compiler/nir/nir_builder.h"
28 #include "vk_format_info.h"
29 #include "util/u_pack_color.h"
30 #include "vulkan/util/vk_common_entrypoints.h"
31 
32 static uint32_t
33 meta_blit_key_hash(const void *key)
34 {
35    return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
36 }
37 
38 static bool
39 meta_blit_key_compare(const void *key1, const void *key2)
40 {
41    return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
42 }
43 
44 static bool
45 create_blit_pipeline_layout(struct v3dv_device *device,
46                             VkDescriptorSetLayout *descriptor_set_layout,
47                             VkPipelineLayout *pipeline_layout)
48 {
49    VkResult result;
50 
51    if (*descriptor_set_layout == 0) {
52       VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
53          .binding = 0,
54          .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
55          .descriptorCount = 1,
56          .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
57       };
58       VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
59          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
60          .bindingCount = 1,
61          .pBindings = &descriptor_set_layout_binding,
62       };
63       result =
64          v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
65                                         &descriptor_set_layout_info,
66                                         &device->vk.alloc,
67                                         descriptor_set_layout);
68       if (result != VK_SUCCESS)
69          return false;
70    }
71 
72    assert(*pipeline_layout == 0);
73    VkPipelineLayoutCreateInfo pipeline_layout_info = {
74       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
75       .setLayoutCount = 1,
76       .pSetLayouts = descriptor_set_layout,
77       .pushConstantRangeCount = 1,
78       .pPushConstantRanges =
79          &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
80    };
81 
82    result =
83       v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
84                                 &pipeline_layout_info,
85                                 &device->vk.alloc,
86                                 pipeline_layout);
87    return result == VK_SUCCESS;
88 }
89 
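/* A minimal sketch, not part of the driver build, of the kind of descriptor
 * update the blit path issues against the layout created above: a single
 * combined image/sampler at binding 0 that sources the blit texture. The
 * handles taken as parameters are hypothetical.
 */
#if 0
static void
example_write_blit_descriptor(VkDevice _device, VkDescriptorSet set,
                              VkImageView src_view, VkSampler sampler)
{
   /* One combined image/sampler write matching binding 0 of ds_layout */
   VkDescriptorImageInfo image_info = {
      .sampler = sampler,
      .imageView = src_view,
      .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
   };
   VkWriteDescriptorSet write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstSet = set,
      .dstBinding = 0,
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
      .pImageInfo = &image_info,
   };
   v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
}
#endif
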
90 void
91 v3dv_meta_blit_init(struct v3dv_device *device)
92 {
93    for (uint32_t i = 0; i < 3; i++) {
94       device->meta.blit.cache[i] =
95          _mesa_hash_table_create(NULL,
96                                  meta_blit_key_hash,
97                                  meta_blit_key_compare);
98    }
99 
100    create_blit_pipeline_layout(device,
101                                &device->meta.blit.ds_layout,
102                                &device->meta.blit.p_layout);
103 }
104 
105 void
106 v3dv_meta_blit_finish(struct v3dv_device *device)
107 {
108    VkDevice _device = v3dv_device_to_handle(device);
109 
110    for (uint32_t i = 0; i < 3; i++) {
111       hash_table_foreach(device->meta.blit.cache[i], entry) {
112          struct v3dv_meta_blit_pipeline *item = entry->data;
113          v3dv_DestroyPipeline(_device, item->pipeline, &device->vk.alloc);
114          v3dv_DestroyRenderPass(_device, item->pass, &device->vk.alloc);
115          v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->vk.alloc);
116          vk_free(&device->vk.alloc, item);
117       }
118       _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
119    }
120 
121    if (device->meta.blit.p_layout) {
122       v3dv_DestroyPipelineLayout(_device, device->meta.blit.p_layout,
123                                  &device->vk.alloc);
124    }
125 
126    if (device->meta.blit.ds_layout) {
127       v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.ds_layout,
128                                       &device->vk.alloc);
129    }
130 }
131 
132 static uint32_t
133 meta_texel_buffer_copy_key_hash(const void *key)
134 {
135    return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
136 }
137 
138 static bool
139 meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
140 {
141    return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
142 }
143 
144 static bool
145 create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
146                                          VkDescriptorSetLayout *ds_layout,
147                                          VkPipelineLayout *p_layout)
148 {
149    VkResult result;
150 
151    if (*ds_layout == 0) {
152       VkDescriptorSetLayoutBinding ds_layout_binding = {
153          .binding = 0,
154          .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
155          .descriptorCount = 1,
156          .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
157       };
158       VkDescriptorSetLayoutCreateInfo ds_layout_info = {
159          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
160          .bindingCount = 1,
161          .pBindings = &ds_layout_binding,
162       };
163       result =
164          v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
165                                         &ds_layout_info,
166                                         &device->vk.alloc,
167                                         ds_layout);
168       if (result != VK_SUCCESS)
169          return false;
170    }
171 
172    assert(*p_layout == 0);
173    /* FIXME: this is abusing the API a bit, since not all of our copy
174     * pipelines have a geometry shader. We could create 2 different pipeline
175     * layouts, but this works for us for now.
176     */
177 #define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET      0
178 #define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET  16
179 #define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET  20
180 #define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET   24
181    VkPushConstantRange ranges[2] = {
182       { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
183       { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 },
184    };
185 
186    VkPipelineLayoutCreateInfo p_layout_info = {
187       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
188       .setLayoutCount = 1,
189       .pSetLayouts = ds_layout,
190       .pushConstantRangeCount = 2,
191       .pPushConstantRanges = ranges,
192    };
193 
194    result =
195       v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
196                                 &p_layout_info,
197                                 &device->vk.alloc,
198                                 p_layout);
199    return result == VK_SUCCESS;
200 }
201 
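/* A sketch, not part of the driver build, of how a texel buffer copy dispatch
 * can feed the two push constant ranges declared above using the *_PC_OFFSET
 * defines: the fragment range starts with a 16-byte destination box and the
 * geometry range carries the target layer. The handle/value parameters are
 * hypothetical; this assumes the driver's own vkCmdPushConstants entrypoint.
 */
#if 0
static void
example_push_texel_buffer_copy_constants(VkCommandBuffer cmd_buffer_handle,
                                         VkPipelineLayout p_layout,
                                         const float box[4], uint32_t layer)
{
   /* Bytes 0..15 of the fragment range: destination box */
   v3dv_CmdPushConstants(cmd_buffer_handle, p_layout,
                         VK_SHADER_STAGE_FRAGMENT_BIT,
                         TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
                         4 * sizeof(float), box);
   /* Bytes 24..27, geometry range: layer to write */
   v3dv_CmdPushConstants(cmd_buffer_handle, p_layout,
                         VK_SHADER_STAGE_GEOMETRY_BIT,
                         TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
                         sizeof(layer), &layer);
}
#endif
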
202 void
203 v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
204 {
205    for (uint32_t i = 0; i < 3; i++) {
206       device->meta.texel_buffer_copy.cache[i] =
207          _mesa_hash_table_create(NULL,
208                                  meta_texel_buffer_copy_key_hash,
209                                  meta_texel_buffer_copy_key_compare);
210    }
211 
212    create_texel_buffer_copy_pipeline_layout(
213       device,
214       &device->meta.texel_buffer_copy.ds_layout,
215       &device->meta.texel_buffer_copy.p_layout);
216 }
217 
218 void
219 v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
220 {
221    VkDevice _device = v3dv_device_to_handle(device);
222 
223    for (uint32_t i = 0; i < 3; i++) {
224       hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
225          struct v3dv_meta_texel_buffer_copy_pipeline *item = entry->data;
226          v3dv_DestroyPipeline(_device, item->pipeline, &device->vk.alloc);
227          v3dv_DestroyRenderPass(_device, item->pass, &device->vk.alloc);
228          v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->vk.alloc);
229          vk_free(&device->vk.alloc, item);
230       }
231       _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
232    }
233 
234    if (device->meta.texel_buffer_copy.p_layout) {
235       v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.p_layout,
236                                  &device->vk.alloc);
237    }
238 
239    if (device->meta.texel_buffer_copy.ds_layout) {
240       v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.ds_layout,
241                                       &device->vk.alloc);
242    }
243 }
244 
245 static VkFormat
246 get_compatible_tlb_format(VkFormat format)
247 {
248    switch (format) {
249    case VK_FORMAT_R8G8B8A8_SNORM:
250       return VK_FORMAT_R8G8B8A8_UINT;
251 
252    case VK_FORMAT_R8G8_SNORM:
253       return VK_FORMAT_R8G8_UINT;
254 
255    case VK_FORMAT_R8_SNORM:
256       return VK_FORMAT_R8_UINT;
257 
258    case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
259       return VK_FORMAT_A8B8G8R8_UINT_PACK32;
260 
261    case VK_FORMAT_R16_UNORM:
262    case VK_FORMAT_R16_SNORM:
263       return VK_FORMAT_R16_UINT;
264 
265    case VK_FORMAT_R16G16_UNORM:
266    case VK_FORMAT_R16G16_SNORM:
267       return VK_FORMAT_R16G16_UINT;
268 
269    case VK_FORMAT_R16G16B16A16_UNORM:
270    case VK_FORMAT_R16G16B16A16_SNORM:
271       return VK_FORMAT_R16G16B16A16_UINT;
272 
273    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
274       return VK_FORMAT_R32_SFLOAT;
275 
276    /* We can't render to compressed formats using the TLB so instead we use
277     * a compatible format with the same bpp as the compressed format. Because
278     * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
279     * case of ETC), when we implement copies with the compatible format we
280     * will have to divide offsets and dimensions of the compressed image by
281     * the compressed block size.
282     */
283    case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
284    case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
285    case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
286    case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
287    case VK_FORMAT_BC2_UNORM_BLOCK:
288    case VK_FORMAT_BC2_SRGB_BLOCK:
289    case VK_FORMAT_BC3_SRGB_BLOCK:
290    case VK_FORMAT_BC3_UNORM_BLOCK:
291    case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
292    case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
293    case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
294    case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
295    case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
296    case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
297    case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
298    case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
299    case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
300    case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
301    case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
302    case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
303    case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
304    case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
305    case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
306    case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
307    case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
308    case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
309    case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
310    case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
311    case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
312    case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
313    case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
314    case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
315    case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
316    case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
317    case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
318    case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
319       return VK_FORMAT_R32G32B32A32_UINT;
320 
321    case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
322    case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
323    case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
324    case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
325    case VK_FORMAT_EAC_R11_UNORM_BLOCK:
326    case VK_FORMAT_EAC_R11_SNORM_BLOCK:
327    case VK_FORMAT_BC1_RGB_UNORM_BLOCK:
328    case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
329    case VK_FORMAT_BC1_RGBA_UNORM_BLOCK:
330    case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
331       return VK_FORMAT_R16G16B16A16_UINT;
332 
333    default:
334       return VK_FORMAT_UNDEFINED;
335    }
336 }
337 
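/* A worked sketch, not part of the driver build, of how the compatible
 * formats above preserve the per-block bit size: an ETC2 RGBA8 block packs
 * 4x4 texels into 128 bits, which is exactly one R32G32B32A32_UINT texel,
 * so a copy is emitted with the image's dimensions in blocks. The 64x64
 * image size is hypothetical.
 */
#if 0
static void
example_compressed_tlb_dimensions(void)
{
   const VkFormat compat =
      get_compatible_tlb_format(VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK);
   assert(compat == VK_FORMAT_R32G32B32A32_UINT);

   /* A 64x64 ETC2 image is 16x16 blocks, so the TLB job would be emitted
    * as a 16x16 frame in the 128-bit compatible format.
    */
   const uint32_t width = DIV_ROUND_UP(64, 4);
   const uint32_t height = DIV_ROUND_UP(64, 4);
   assert(width == 16 && height == 16);
}
#endif
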
338 /**
339  * Checks if we can implement an image copy or clear operation using the TLB
340  * hardware.
341  */
342 bool
343 v3dv_meta_can_use_tlb(struct v3dv_image *image,
344                       const VkOffset3D *offset,
345                       VkFormat *compat_format)
346 {
347    if (offset->x != 0 || offset->y != 0)
348       return false;
349 
350    if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
351       if (compat_format)
352          *compat_format = image->vk.format;
353       return true;
354    }
355 
356    /* If the image format is not TLB-supported, then check if we can use
357     * a compatible format instead.
358     */
359    if (compat_format) {
360       *compat_format = get_compatible_tlb_format(image->vk.format);
361       if (*compat_format != VK_FORMAT_UNDEFINED)
362          return true;
363    }
364 
365    return false;
366 }
367 
368 /* Implements a copy using the TLB.
369  *
370  * This only works if we are copying from offset (0,0), since a TLB store for
371  * tile (x,y) will be written at the same tile offset into the destination.
372  * When this requirement is not met, we need to use a blit instead.
373  *
374  * Returns true if the implementation supports the requested operation (even if
375  * it failed to process it, for example, due to an out-of-memory error).
377  */
378 static bool
379 copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
380                          struct v3dv_buffer *buffer,
381                          struct v3dv_image *image,
382                          const VkBufferImageCopy2KHR *region)
383 {
384    VkFormat fb_format;
385    if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
386       return false;
387 
388    uint32_t internal_type, internal_bpp;
389    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
390       (fb_format, region->imageSubresource.aspectMask,
391        &internal_type, &internal_bpp);
392 
393    uint32_t num_layers;
394    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
395       num_layers = region->imageSubresource.layerCount;
396    else
397       num_layers = region->imageExtent.depth;
398    assert(num_layers > 0);
399 
400    struct v3dv_job *job =
401       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
402    if (!job)
403       return true;
404 
405    /* Handle copy from compressed format using a compatible format */
406    const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
407    const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
408    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
409    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
410 
411    v3dv_job_start_frame(job, width, height, num_layers, false,
412                         1, internal_bpp, false);
413 
414    struct v3dv_meta_framebuffer framebuffer;
415    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
416                                               internal_type, &job->frame_tiling);
417 
418    v3dv_X(job->device, job_emit_binning_flush)(job);
419    v3dv_X(job->device, meta_emit_copy_image_to_buffer_rcl)
420       (job, buffer, image, &framebuffer, region);
421 
422    v3dv_cmd_buffer_finish_job(cmd_buffer);
423 
424    return true;
425 }
426 
427 static bool
428 blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
429             struct v3dv_image *dst,
430             VkFormat dst_format,
431             struct v3dv_image *src,
432             VkFormat src_format,
433             VkColorComponentFlags cmask,
434             VkComponentMapping *cswizzle,
435             const VkImageBlit2KHR *region,
436             VkFilter filter,
437             bool dst_is_padded_image);
438 
439 /**
440  * Returns true if the implementation supports the requested operation (even if
441  * it failed to process it, for example, due to an out-of-memory error).
442  */
443 static bool
444 copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
445                           struct v3dv_buffer *buffer,
446                           struct v3dv_image *image,
447                           const VkBufferImageCopy2KHR *region)
448 {
449    bool handled = false;
450 
451    /* Generally, the bpp of the data in the buffer matches that of the
452     * source image. The exception is the case where we are copying the
453     * stencil aspect (8bpp) out of a combined d24s8 image (32bpp).
454     */
455    uint32_t buffer_bpp = image->cpp;
456 
457    VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask;
458 
459    /* Because we are going to implement the copy as a blit, we need to create
460     * a linear image from the destination buffer and we also want our blit
461     * source and destination formats to be the same (to avoid any format
462     * conversions), so we choose a canonical format that matches the
463     * source image bpp.
464     *
465     * The exception to the above is copying from combined depth/stencil images
466     * because we are copying only one aspect of the image, so we need to set up
467     * our formats, color write mask and source swizzle mask to match that.
468     */
469    VkFormat dst_format;
470    VkFormat src_format;
471    VkColorComponentFlags cmask = 0; /* All components */
472    VkComponentMapping cswizzle = {
473       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
474       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
475       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
476       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
477    };
478    switch (buffer_bpp) {
479    case 16:
480       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
481       dst_format = VK_FORMAT_R32G32B32A32_UINT;
482       src_format = dst_format;
483       break;
484    case 8:
485       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
486       dst_format = VK_FORMAT_R16G16B16A16_UINT;
487       src_format = dst_format;
488       break;
489    case 4:
490       switch (copy_aspect) {
491       case VK_IMAGE_ASPECT_COLOR_BIT:
492          src_format = VK_FORMAT_R8G8B8A8_UINT;
493          dst_format = VK_FORMAT_R8G8B8A8_UINT;
494          break;
495       case VK_IMAGE_ASPECT_DEPTH_BIT:
496          assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
497                 image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
498                 image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
499          if (image->vk.format == VK_FORMAT_D32_SFLOAT) {
500             src_format = VK_FORMAT_R32_UINT;
501             dst_format = VK_FORMAT_R32_UINT;
502          } else {
503             /* We want to write depth in the buffer in the first 24 bits,
504              * however, the hardware has depth in bits 8-31, so swizzle the
505              * source components to match what we want. Also, we don't
506              * want to write bits 24-31 in the destination.
507              */
508             src_format = VK_FORMAT_R8G8B8A8_UINT;
509             dst_format = VK_FORMAT_R8G8B8A8_UINT;
510             cmask = VK_COLOR_COMPONENT_R_BIT |
511                     VK_COLOR_COMPONENT_G_BIT |
512                     VK_COLOR_COMPONENT_B_BIT;
513             cswizzle.r = VK_COMPONENT_SWIZZLE_G;
514             cswizzle.g = VK_COMPONENT_SWIZZLE_B;
515             cswizzle.b = VK_COMPONENT_SWIZZLE_A;
516             cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
517          }
518          break;
519       case VK_IMAGE_ASPECT_STENCIL_BIT:
520          assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
521          assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
522          /* Copying from S8D24. We want to write 8-bit stencil values only,
523           * so adjust the buffer bpp for that. Since the hardware stores stencil
524           * in the LSB, we can just do an RGBA8UI to R8UI blit.
525           */
526          src_format = VK_FORMAT_R8G8B8A8_UINT;
527          dst_format = VK_FORMAT_R8_UINT;
528          buffer_bpp = 1;
529          break;
530       default:
531          unreachable("unsupported aspect");
532          return handled;
533       };
534       break;
535    case 2:
536       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
537              copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
538       dst_format = VK_FORMAT_R16_UINT;
539       src_format = dst_format;
540       break;
541    case 1:
542       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
543       dst_format = VK_FORMAT_R8_UINT;
544       src_format = dst_format;
545       break;
546    default:
547       unreachable("unsupported bit-size");
548       return handled;
549    };
550 
551    /* The hardware doesn't support linear depth/stencil stores, so we
552     * implement copies of depth/stencil aspect as color copies using a
553     * compatible color format.
554     */
555    assert(vk_format_is_color(src_format));
556    assert(vk_format_is_color(dst_format));
557    copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;
558 
559    /* We should be able to handle the blit if we got this far */
560    handled = true;
561 
562    /* Obtain the 2D buffer region spec */
563    uint32_t buf_width, buf_height;
564    if (region->bufferRowLength == 0)
565       buf_width = region->imageExtent.width;
566    else
567       buf_width = region->bufferRowLength;
568 
569    if (region->bufferImageHeight == 0)
570       buf_height = region->imageExtent.height;
571    else
572       buf_height = region->bufferImageHeight;
573 
574    /* If the image is compressed, the bpp refers to blocks, not pixels */
575    uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
576    uint32_t block_height = vk_format_get_blockheight(image->vk.format);
577    buf_width = buf_width / block_width;
578    buf_height = buf_height / block_height;
579 
580    /* Compute layers to copy */
581    uint32_t num_layers;
582    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
583       num_layers = region->imageSubresource.layerCount;
584    else
585       num_layers = region->imageExtent.depth;
586    assert(num_layers > 0);
587 
588    /* Our blit interface can see the real format of the images to detect
589     * copies between compressed and uncompressed images and adapt the
590     * blit region accordingly. Here we are just doing a raw copy of
591     * compressed data, but we are passing an uncompressed view of the
592     * buffer for the blit destination image (since compressed formats are
593     * not renderable), so we also want to provide an uncompressed view of
594     * the source image.
595     */
596    VkResult result;
597    struct v3dv_device *device = cmd_buffer->device;
598    VkDevice _device = v3dv_device_to_handle(device);
599    if (vk_format_is_compressed(image->vk.format)) {
600       VkImage uiview;
601       VkImageCreateInfo uiview_info = {
602          .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
603          .imageType = VK_IMAGE_TYPE_3D,
604          .format = dst_format,
605          .extent = { buf_width, buf_height, image->vk.extent.depth },
606          .mipLevels = image->vk.mip_levels,
607          .arrayLayers = image->vk.array_layers,
608          .samples = image->vk.samples,
609          .tiling = image->vk.tiling,
610          .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
611          .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
612          .queueFamilyIndexCount = 0,
613          .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
614       };
615       result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview);
616       if (result != VK_SUCCESS)
617          return handled;
618 
619       v3dv_cmd_buffer_add_private_obj(
620          cmd_buffer, (uintptr_t)uiview,
621          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
622 
623       result =
624          vk_common_BindImageMemory(_device, uiview,
625                                    v3dv_device_memory_to_handle(image->mem),
626                                    image->mem_offset);
627       if (result != VK_SUCCESS)
628          return handled;
629 
630       image = v3dv_image_from_handle(uiview);
631    }
632 
633    /* Copy requested layers */
634    for (uint32_t i = 0; i < num_layers; i++) {
635       /* Create the destination blit image from the destination buffer */
636       VkImageCreateInfo image_info = {
637          .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
638          .imageType = VK_IMAGE_TYPE_2D,
639          .format = dst_format,
640          .extent = { buf_width, buf_height, 1 },
641          .mipLevels = 1,
642          .arrayLayers = 1,
643          .samples = VK_SAMPLE_COUNT_1_BIT,
644          .tiling = VK_IMAGE_TILING_LINEAR,
645          .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
646          .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
647          .queueFamilyIndexCount = 0,
648          .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
649       };
650 
651       VkImage buffer_image;
652       result =
653          v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
654       if (result != VK_SUCCESS)
655          return handled;
656 
657       v3dv_cmd_buffer_add_private_obj(
658          cmd_buffer, (uintptr_t)buffer_image,
659          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
660 
661       /* Bind the buffer memory to the image */
662       VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
663          i * buf_width * buf_height * buffer_bpp;
664       result =
665          vk_common_BindImageMemory(_device, buffer_image,
666                                    v3dv_device_memory_to_handle(buffer->mem),
667                                    buffer_offset);
668       if (result != VK_SUCCESS)
669          return handled;
670 
671       /* Blit-copy the requested image extent.
672        *
673        * Since we are copying, the blit must use the same format on the
674        * destination and source images to avoid format conversions. The
675        * only exception is copying stencil, which we upload to a R8UI source
676        * image, but that we need to blit to a S8D24 destination (the only
677        * stencil format we support).
678        */
679       const VkImageBlit2KHR blit_region = {
680          .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
681          .srcSubresource = {
682             .aspectMask = copy_aspect,
683             .mipLevel = region->imageSubresource.mipLevel,
684             .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
685             .layerCount = 1,
686          },
687          .srcOffsets = {
688             {
689                DIV_ROUND_UP(region->imageOffset.x, block_width),
690                DIV_ROUND_UP(region->imageOffset.y, block_height),
691                region->imageOffset.z + i,
692             },
693             {
694                DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
695                             block_width),
696                DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
697                             block_height),
698                region->imageOffset.z + i + 1,
699             },
700          },
701          .dstSubresource = {
702             .aspectMask = copy_aspect,
703             .mipLevel = 0,
704             .baseArrayLayer = 0,
705             .layerCount = 1,
706          },
707          .dstOffsets = {
708             { 0, 0, 0 },
709             {
710                DIV_ROUND_UP(region->imageExtent.width, block_width),
711                DIV_ROUND_UP(region->imageExtent.height, block_height),
712                1
713             },
714          },
715       };
716 
717       handled = blit_shader(cmd_buffer,
718                             v3dv_image_from_handle(buffer_image), dst_format,
719                             image, src_format,
720                             cmask, &cswizzle,
721                             &blit_region, VK_FILTER_NEAREST, false);
722       if (!handled) {
723          /* This is unexpected, we should have a supported blit spec */
724          unreachable("Unable to blit buffer to destination image");
725          return false;
726       }
727    }
728 
729    assert(handled);
730    return true;
731 }
732 
733 VKAPI_ATTR void VKAPI_CALL
734 v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
735                               const VkCopyImageToBufferInfo2KHR *info)
737 {
738    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
739    V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage);
740    V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer);
741 
742    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
743 
744    for (uint32_t i = 0; i < info->regionCount; i++) {
745       if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i]))
746          continue;
747       if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i]))
748          continue;
749       unreachable("Unsupported image to buffer copy.");
750    }
751 }
752 
753 /**
754  * Returns true if the implementation supports the requested operation (even if
755  * it failed to process it, for example, due to an out-of-memory error).
756  */
757 static bool
758 copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
759                struct v3dv_image *dst,
760                struct v3dv_image *src,
761                const VkImageCopy2KHR *region)
762 {
763    /* Destination can't be raster format */
764    if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR)
765       return false;
766 
767    /* We can only do full copies, so if the format is D24S8 both aspects need
768     * to be copied. We only need to check the dst format because the spec
769     * states that depth/stencil formats must match exactly.
770     */
771    if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
772        const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
773                                              VK_IMAGE_ASPECT_STENCIL_BIT;
774        if (region->dstSubresource.aspectMask != ds_aspects)
775          return false;
776    }
777 
778    /* Don't handle copies between uncompressed and compressed formats for now.
779     *
780     * FIXME: we should be able to handle these easily but there is no coverage
781     * in CTS at the moment that exercises such copies with full images (which we
782     * require here), only partial copies. Also, in that case the code below that
783     * checks for "dst image complete" requires some changes, since it is
784     * checking against the region dimensions, which are in units of the source
785     * image format.
786     */
787    if (vk_format_is_compressed(dst->vk.format) !=
788        vk_format_is_compressed(src->vk.format)) {
789       return false;
790    }
791 
792    /* Source region must start at (0,0) */
793    if (region->srcOffset.x != 0 || region->srcOffset.y != 0)
794       return false;
795 
796    /* Destination image must be complete */
797    if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
798       return false;
799 
800    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
801    uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
802    uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
803    if (region->extent.width != dst_width || region->extent.height != dst_height)
804       return false;
805 
806    /* From vkCmdCopyImage:
807     *
808     *   "When copying between compressed and uncompressed formats the extent
809     *    members represent the texel dimensions of the source image and not
810     *    the destination."
811     */
812    const uint32_t block_w = vk_format_get_blockwidth(src->vk.format);
813    const uint32_t block_h = vk_format_get_blockheight(src->vk.format);
814    uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
815    uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
816 
817    /* Account for sample count: 4x MSAA is stored as 2x2 samples per pixel */
818    assert(dst->vk.samples == src->vk.samples);
819    if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) {
820       assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT);
821       width *= 2;
822       height *= 2;
823    }
824 
825    /* The TFU unit doesn't handle format conversions so we need the formats to
826     * match. On the other hand, vkCmdCopyImage allows different color formats
827     * on the source and destination images, but only if they are texel
828     * compatible. For us, this means that we can effectively ignore different
829     * formats and just make the copy using either of them, since we are just
830     * moving raw data and not making any conversions.
831     *
832     * Also, the formats supported by the TFU unit are limited, but again, since
833     * we are only doing raw copies here without interpreting or converting
834     * the underlying pixel data according to its format, we can always choose
835     * to use compatible formats that are supported with the TFU unit.
836     */
837    assert(dst->cpp == src->cpp);
838    const struct v3dv_format *format =
839       v3dv_get_compatible_tfu_format(cmd_buffer->device,
840                                      dst->cpp, NULL);
841 
842    /* Emit a TFU job for each layer to blit */
843    const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
844       region->dstSubresource.layerCount :
845       region->extent.depth;
846    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
847 
848    const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
849       region->srcSubresource.baseArrayLayer : region->srcOffset.z;
850    const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
851       region->dstSubresource.baseArrayLayer : region->dstOffset.z;
852    for (uint32_t i = 0; i < layer_count; i++) {
853       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)
854          (cmd_buffer, dst, dst_mip_level, base_dst_layer + i,
855           src, src_mip_level, base_src_layer + i,
856           width, height, format);
857    }
858 
859    return true;
860 }
861 
862 /**
863  * Returns true if the implementation supports the requested operation (even if
864  * it failed to process it, for example, due to an out-of-memory error).
865  */
866 static bool
867 copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
868                struct v3dv_image *dst,
869                struct v3dv_image *src,
870                const VkImageCopy2KHR *region)
871 {
872    VkFormat fb_format;
873    if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, &fb_format) ||
874        !v3dv_meta_can_use_tlb(dst, &region->dstOffset, &fb_format)) {
875       return false;
876    }
877 
878    /* From the Vulkan spec, VkImageCopy valid usage:
879     *
880     *    "If neither the calling command’s srcImage nor the calling command’s
881     *     dstImage has a multi-planar image format then the aspectMask member
882     *     of srcSubresource and dstSubresource must match."
883     */
884    assert(region->dstSubresource.aspectMask ==
885           region->srcSubresource.aspectMask);
886    uint32_t internal_type, internal_bpp;
887    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
888       (fb_format, region->dstSubresource.aspectMask,
889        &internal_type, &internal_bpp);
890 
891    /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
892     *
893     * "The number of slices of the extent (for 3D) or layers of the
894     *  srcSubresource (for non-3D) must match the number of slices of the
895     *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
896     */
897    assert((src->vk.image_type != VK_IMAGE_TYPE_3D ?
898            region->srcSubresource.layerCount : region->extent.depth) ==
899           (dst->vk.image_type != VK_IMAGE_TYPE_3D ?
900            region->dstSubresource.layerCount : region->extent.depth));
901    uint32_t num_layers;
902    if (dst->vk.image_type != VK_IMAGE_TYPE_3D)
903       num_layers = region->dstSubresource.layerCount;
904    else
905       num_layers = region->extent.depth;
906    assert(num_layers > 0);
907 
908    struct v3dv_job *job =
909       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
910    if (!job)
911       return true;
912 
913    /* Handle copy to compressed image using compatible format */
914    const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format);
915    const uint32_t block_h = vk_format_get_blockheight(dst->vk.format);
916    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
917    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
918 
919    v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp,
920                         src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
921 
922    struct v3dv_meta_framebuffer framebuffer;
923    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
924                                               internal_type, &job->frame_tiling);
925 
926    v3dv_X(job->device, job_emit_binning_flush)(job);
927    v3dv_X(job->device, meta_emit_copy_image_rcl)(job, dst, src, &framebuffer, region);
928 
929    v3dv_cmd_buffer_finish_job(cmd_buffer);
930 
931    return true;
932 }
933 
934 /**
935  * Takes the image provided as argument and creates a new image that has
936  * the same specification and aliases the same memory storage, except that:
937  *
938  *   - It has the uncompressed format passed in.
939  *   - Its original width/height are scaled by the factors passed in.
940  *
941  * This is useful to implement copies from compressed images using the blit
942  * path. The idea is that we create uncompressed "image views" of both the
943  * source and destination images using the uncompressed format and then we
944  * define the copy blit in terms of that format.
945  */
946 static struct v3dv_image *
947 create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
948                    struct v3dv_image *src,
949                    float width_scale,
950                    float height_scale,
951                    VkFormat format)
952 {
953    assert(!vk_format_is_compressed(format));
954 
955    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
956 
957    VkImageCreateInfo info = {
958       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
959       .imageType = src->vk.image_type,
960       .format = format,
961       .extent = {
962          .width = src->vk.extent.width * width_scale,
963          .height = src->vk.extent.height * height_scale,
964          .depth = src->vk.extent.depth,
965       },
966       .mipLevels = src->vk.mip_levels,
967       .arrayLayers = src->vk.array_layers,
968       .samples = src->vk.samples,
969       .tiling = src->vk.tiling,
970       .usage = src->vk.usage,
971    };
972 
973     VkImage _image;
974     VkResult result =
975       v3dv_CreateImage(_device, &info, &cmd_buffer->device->vk.alloc, &_image);
976     if (result != VK_SUCCESS) {
977        v3dv_flag_oom(cmd_buffer, NULL);
978        return NULL;
979     }
980 
981     struct v3dv_image *image = v3dv_image_from_handle(_image);
982     image->mem = src->mem;
983     image->mem_offset = src->mem_offset;
984     return image;
985 }
986 
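/* A minimal usage sketch, not part of the driver build (see copy_image_blit()
 * below for the real call sites): a hypothetical 64x64 ETC2 RGBA8 image
 * (4x4 blocks, 16 cpp) can be aliased as a 16x16 R32G32B32A32_UINT image over
 * the same memory by scaling both dimensions by the inverse of the block size.
 */
#if 0
   struct v3dv_image *src_alias =
      create_image_alias(cmd_buffer, src,
                         1.0f / 4.0f, 1.0f / 4.0f,
                         VK_FORMAT_R32G32B32A32_UINT);
#endif
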
987 /**
988  * Returns true if the implementation supports the requested operation (even if
989  * it failed to process it, for example, due to an out-of-memory error).
990  */
991 static bool
992 copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
993                 struct v3dv_image *dst,
994                 struct v3dv_image *src,
995                 const VkImageCopy2KHR *region)
996 {
997    const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format);
998    const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format);
999    const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format);
1000    const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format);
1001    const float block_scale_w = (float)src_block_w / (float)dst_block_w;
1002    const float block_scale_h = (float)src_block_h / (float)dst_block_h;
1003 
1004    /* We need to choose a single format for the blit to ensure that this is
1005     * really a copy and there are no format conversions going on. Since we are
1006     * going to blit, we need to make sure that the selected format can be
1007     * both rendered to and textured from.
1008     */
1009    VkFormat format;
1010    float src_scale_w = 1.0f;
1011    float src_scale_h = 1.0f;
1012    float dst_scale_w = block_scale_w;
1013    float dst_scale_h = block_scale_h;
1014    if (vk_format_is_compressed(src->vk.format)) {
1015       /* If we are copying from a compressed format we should be aware that we
1016        * are going to texture from the source image, and the texture setup
1017        * knows the actual size of the image, so we need to choose a format
1018        * that has a per-texel (not per-block) bpp that is compatible for that
1019        * image size. For example, for a source image with size Bw*WxBh*H
1020        * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
1021        * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
1022        * (which translates to a 128-bit 4x4 RGBA32 block when uncompressed),
1023        * so we could specify a blit with size Bw*WxBh*H and a format with
1024        * a bpp of 8-bit per texel (R8_UINT).
1025        *
1026        * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
1027        * which is 64-bit per block (4 bits per texel), we would need a 4-bit
1028        * format, which we don't have, so instead we still choose an 8-bit format,
1029        * but we apply a divisor to the row dimensions of the blit, since we are
1030        * copying two texels per item.
1031        *
1032        * Generally, we can choose any format so long as we compute appropriate
1033        * divisors for the width and height depending on the source image's
1034        * bpp.
1035        */
1036       assert(src->cpp == dst->cpp);
1037 
1038       format = VK_FORMAT_R32G32_UINT;
1039       switch (src->cpp) {
1040       case 16:
1041          format = VK_FORMAT_R32G32B32A32_UINT;
1042          break;
1043       case 8:
1044          format = VK_FORMAT_R16G16B16A16_UINT;
1045          break;
1046       default:
1047          unreachable("Unsupported compressed format");
1048       }
1049 
1050       /* Create image views of the src/dst images that we can interpret in
1051        * terms of the canonical format.
1052        */
1053       src_scale_w /= src_block_w;
1054       src_scale_h /= src_block_h;
1055       dst_scale_w /= src_block_w;
1056       dst_scale_h /= src_block_h;
1057 
1058       src = create_image_alias(cmd_buffer, src,
1059                                src_scale_w, src_scale_h, format);
1060 
1061       dst = create_image_alias(cmd_buffer, dst,
1062                                dst_scale_w, dst_scale_h, format);
1063    } else {
1064       format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
1065          src->vk.format : get_compatible_tlb_format(src->vk.format);
1066       if (format == VK_FORMAT_UNDEFINED)
1067          return false;
1068 
1069       const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format);
1070       if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO)
1071          return false;
1072    }
1073 
1074    /* Given an uncompressed image with size WxH, if we copy it to a compressed
1075     * image, it will result in an image with size W*bWxH*bH, where bW and bH
1076     * are the compressed format's block width and height. This means that
1077     * copies between compressed and uncompressed images involve different
1078     * image sizes, and therefore, we need to take that into account when
1079     * setting up the source and destination blit regions below, so they are
1080     * consistent from the point of view of the single compatible format
1081     * selected for the copy.
1082     *
1083     * We should take into account that the dimensions of the region provided
1084     * to the copy command are specified in terms of the source image. With that
1085     * in mind, below we adjust the blit destination region to be consistent with
1086     * the source region for the compatible format, so basically, we apply
1087     * the block scale factor to the destination offset provided by the copy
1088     * command (because it is specified in terms of the destination image, not
1089     * the source), and then we just add the region copy dimensions to that
1090     * (since the region dimensions are already specified in terms of the source
1091     * image).
1092     */
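   /* Worked example with hypothetical numbers: copying a 16x16 texel region
    * at srcOffset (8,8) of an ETC2 RGBA8 image (4x4 blocks, 16 cpp) to
    * dstOffset (32,32) of an R32G32B32A32_UINT image. block_scale is 4, the
    * canonical format is R32G32B32A32_UINT, src_scale is 1/4 and dst_scale
    * is 1, so the blit reads the 4x4 block region starting at (2,2) of the
    * source alias and writes 4x4 texels starting at (32,32) in the
    * destination.
    */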
1093    const VkOffset3D src_start = {
1094       region->srcOffset.x * src_scale_w,
1095       region->srcOffset.y * src_scale_h,
1096       region->srcOffset.z,
1097    };
1098    const VkOffset3D src_end = {
1099       src_start.x + region->extent.width * src_scale_w,
1100       src_start.y + region->extent.height * src_scale_h,
1101       src_start.z + region->extent.depth,
1102    };
1103 
1104    const VkOffset3D dst_start = {
1105       region->dstOffset.x * dst_scale_w,
1106       region->dstOffset.y * dst_scale_h,
1107       region->dstOffset.z,
1108    };
1109    const VkOffset3D dst_end = {
1110       dst_start.x + region->extent.width * src_scale_w,
1111       dst_start.y + region->extent.height * src_scale_h,
1112       dst_start.z + region->extent.depth,
1113    };
1114 
1115    const VkImageBlit2KHR blit_region = {
1116       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
1117       .srcSubresource = region->srcSubresource,
1118       .srcOffsets = { src_start, src_end },
1119       .dstSubresource = region->dstSubresource,
1120       .dstOffsets = { dst_start, dst_end },
1121    };
1122    bool handled = blit_shader(cmd_buffer,
1123                               dst, format,
1124                               src, format,
1125                               0, NULL,
1126                               &blit_region, VK_FILTER_NEAREST, true);
1127 
1128    /* We should have selected formats that we can blit */
1129    assert(handled);
1130    return handled;
1131 }
1132 
1133 VKAPI_ATTR void VKAPI_CALL
1134 v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
1135                       const VkCopyImageInfo2KHR *info)
1137 {
1138    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1139    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
1140    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
1141 
1142    assert(src->vk.samples == dst->vk.samples);
1143 
1144    for (uint32_t i = 0; i < info->regionCount; i++) {
1145       if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i]))
1146          continue;
1147       if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
1148          continue;
1149       if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
1150          continue;
1151       unreachable("Image copy not supported");
1152    }
1153 }
1154 
1155 VKAPI_ATTR void VKAPI_CALL
1156 v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
1157                        const VkCopyBufferInfo2KHR *pCopyBufferInfo)
1158 {
1159    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1160    V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
1161    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
1162 
1163    for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
1164       v3dv_X(cmd_buffer->device, meta_copy_buffer)
1165          (cmd_buffer,
1166           dst_buffer->mem->bo, dst_buffer->mem_offset,
1167           src_buffer->mem->bo, src_buffer->mem_offset,
1168           &pCopyBufferInfo->pRegions[i]);
1169    }
1170 }
1171 
1172 static void
1173 destroy_update_buffer_cb(VkDevice _device,
1174                          uint64_t pobj,
1175                          VkAllocationCallbacks *alloc)
1176 {
1177    V3DV_FROM_HANDLE(v3dv_device, device, _device);
1178    struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
1179    v3dv_bo_free(device, bo);
1180 }
1181 
1182 VKAPI_ATTR void VKAPI_CALL
1183 v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1184                      VkBuffer dstBuffer,
1185                      VkDeviceSize dstOffset,
1186                      VkDeviceSize dataSize,
1187                      const void *pData)
1188 {
1189    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1190    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1191 
1192    struct v3dv_bo *src_bo =
1193       v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
1194    if (!src_bo) {
1195       fprintf(stderr, "Failed to allocate BO for vkCmdUpdateBuffer.\n");
1196       return;
1197    }
1198 
1199    bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
1200    if (!ok) {
1201       fprintf(stderr, "Failed to map BO for vkCmdUpdateBuffer.\n");
           v3dv_bo_free(cmd_buffer->device, src_bo);
1202       return;
1203    }
1204 
1205    memcpy(src_bo->map, pData, dataSize);
1206 
1207    v3dv_bo_unmap(cmd_buffer->device, src_bo);
1208 
1209    VkBufferCopy2KHR region = {
1210       .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR,
1211       .srcOffset = 0,
1212       .dstOffset = dstOffset,
1213       .size = dataSize,
1214    };
1215    struct v3dv_job *copy_job =
1216       v3dv_X(cmd_buffer->device, meta_copy_buffer)
1217       (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
1218        src_bo, 0, &region);
1219 
1220    if (!copy_job)
1221       return;
1222 
1223    v3dv_cmd_buffer_add_private_obj(
1224       cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
1225 }
1226 
1227 VKAPI_ATTR void VKAPI_CALL
1228 v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
1229                    VkBuffer dstBuffer,
1230                    VkDeviceSize dstOffset,
1231                    VkDeviceSize size,
1232                    uint32_t data)
1233 {
1234    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1235    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1236 
1237    struct v3dv_bo *bo = dst_buffer->mem->bo;
1238 
1239    /* From the Vulkan spec:
1240     *
1241     *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
1242     *    a multiple of 4, then the nearest smaller multiple is used."
1243     */
1244    if (size == VK_WHOLE_SIZE) {
1245       size = dst_buffer->size - dstOffset;
1246       size -= size % 4;
1247    }
1248 
1249    v3dv_X(cmd_buffer->device, meta_fill_buffer)
1250       (cmd_buffer, bo, dstOffset, size, data);
1251 }
1252 
1253 /**
1254  * Returns true if the implementation supports the requested operation (even if
1255  * it failed to process it, for example, due to an out-of-memory error).
1256  */
1257 static bool
1258 copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
1259                          struct v3dv_image *image,
1260                          struct v3dv_buffer *buffer,
1261                          const VkBufferImageCopy2KHR *region)
1262 {
1263    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
1264 
1265    /* Destination can't be raster format */
1266    if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
1267       return false;
1268 
1269    /* We can't copy D24S8 because buffer to image copies only copy one aspect
1270     * at a time, and the TFU copies full images. Also, V3D depth bits for
1271     * both D24S8 and D24X8 are stored in the upper 24 bits of each 32-bit word,
1272     * but the Vulkan spec has the buffer data specified the other way around,
1273     * so it is not a straight copy: we would have to swizzle the channels,
1274     * which the TFU can't do.
1275     */
1276    if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
1277        image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1278          return false;
1279    }
1280 
1281    /* Region must include full slice */
1282    const uint32_t offset_x = region->imageOffset.x;
1283    const uint32_t offset_y = region->imageOffset.y;
1284    if (offset_x != 0 || offset_y != 0)
1285       return false;
1286 
1287    uint32_t width, height;
1288    if (region->bufferRowLength == 0)
1289       width = region->imageExtent.width;
1290    else
1291       width = region->bufferRowLength;
1292 
1293    if (region->bufferImageHeight == 0)
1294       height = region->imageExtent.height;
1295    else
1296       height = region->bufferImageHeight;
1297 
1298    if (width != image->vk.extent.width || height != image->vk.extent.height)
1299       return false;
1300 
1301    /* Handle region semantics for compressed images */
1302    const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
1303    const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
1304    width = DIV_ROUND_UP(width, block_w);
1305    height = DIV_ROUND_UP(height, block_h);
1306 
1307    /* Format must be supported for texturing via the TFU. Since we are just
1308     * copying raw data and not converting between pixel formats, we can ignore
1309     * the image's format and choose a compatible TFU format for the image
1310     * texel size instead, which expands the list of formats we can handle here.
1311     */
1312    const struct v3dv_format *format =
1313       v3dv_get_compatible_tfu_format(cmd_buffer->device,
1314                                      image->cpp, NULL);
1315 
1316    const uint32_t mip_level = region->imageSubresource.mipLevel;
1317    const struct v3d_resource_slice *slice = &image->slices[mip_level];
1318 
1319    uint32_t num_layers;
1320    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1321       num_layers = region->imageSubresource.layerCount;
1322    else
1323       num_layers = region->imageExtent.depth;
1324    assert(num_layers > 0);
1325 
1326    assert(image->mem && image->mem->bo);
1327    const struct v3dv_bo *dst_bo = image->mem->bo;
1328 
1329    assert(buffer->mem && buffer->mem->bo);
1330    const struct v3dv_bo *src_bo = buffer->mem->bo;
1331 
1332    /* Emit a TFU job per layer to copy */
1333    const uint32_t buffer_stride = width * image->cpp;
1334    for (int i = 0; i < num_layers; i++) {
1335       uint32_t layer;
1336       if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1337          layer = region->imageSubresource.baseArrayLayer + i;
1338       else
1339          layer = region->imageOffset.z + i;
1340 
1341       struct drm_v3d_submit_tfu tfu = {
1342          .ios = (height << 16) | width,
1343          .bo_handles = {
1344             dst_bo->handle,
1345             src_bo->handle != dst_bo->handle ? src_bo->handle : 0
1346          },
1347       };
1348 
1349       const uint32_t buffer_offset =
1350          buffer->mem_offset + region->bufferOffset +
1351          height * buffer_stride * i;
1352 
1353       const uint32_t src_offset = src_bo->offset + buffer_offset;
1354       tfu.iia |= src_offset;
1355       tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT;
1356       tfu.iis |= width;
1357 
1358       const uint32_t dst_offset =
1359          dst_bo->offset + v3dv_layer_offset(image, mip_level, layer);
1360       tfu.ioa |= dst_offset;
1361 
1362       tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
1363                   (slice->tiling - V3D_TILING_LINEARTILE)) <<
1364                    V3D_TFU_IOA_FORMAT_SHIFT;
1365       tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
1366 
1367       /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
1368        * OPAD field for the destination (how many extra UIF blocks beyond
1369        * those necessary to cover the height).
1370        */
1371       if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
1372           slice->tiling == V3D_TILING_UIF_XOR) {
1373          uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp);
1374          uint32_t implicit_padded_height = align(height, uif_block_h);
1375          uint32_t icfg =
1376             (slice->padded_height - implicit_padded_height) / uif_block_h;
1377          tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
1378       }
1379 
1380       v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
1381    }
1382 
1383    return true;
1384 }
1385 
1386 /**
1387  * Returns true if the implementation supports the requested operation (even if
1388  * it failed to process it, for example, due to an out-of-memory error).
1389  */
1390 static bool
1391 copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1392                          struct v3dv_image *image,
1393                          struct v3dv_buffer *buffer,
1394                          const VkBufferImageCopy2KHR *region)
1395 {
1396    VkFormat fb_format;
1397    if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
1398       return false;
1399 
1400    uint32_t internal_type, internal_bpp;
1401    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
1402       (fb_format, region->imageSubresource.aspectMask,
1403        &internal_type, &internal_bpp);
1404 
1405    uint32_t num_layers;
1406    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1407       num_layers = region->imageSubresource.layerCount;
1408    else
1409       num_layers = region->imageExtent.depth;
1410    assert(num_layers > 0);
1411 
1412    struct v3dv_job *job =
1413       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1414    if (!job)
1415       return true;
1416 
1417    /* Handle copy to compressed format using a compatible format */
1418    const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
1419    const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
1420    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
1421    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
1422 
1423    v3dv_job_start_frame(job, width, height, num_layers, false,
1424                         1, internal_bpp, false);
1425 
1426    struct v3dv_meta_framebuffer framebuffer;
1427    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
1428                                               internal_type, &job->frame_tiling);
1429 
1430    v3dv_X(job->device, job_emit_binning_flush)(job);
1431    v3dv_X(job->device, meta_emit_copy_buffer_to_image_rcl)
1432       (job, image, buffer, &framebuffer, region);
1433 
1434    v3dv_cmd_buffer_finish_job(cmd_buffer);
1435 
1436    return true;
1437 }
1438 
1439 static bool
1440 create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
1441                                struct v3dv_image *image,
1442                                struct v3dv_buffer *buffer,
1443                                const VkBufferImageCopy2KHR *region)
1444 {
1445    if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
1446       return true;
1447    if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
1448       return true;
1449    return false;
1450 }
1451 
1452 static VkResult
1453 create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
1454 {
1455    /* If this is not the first pool we create for this command buffer,
1456     * size it based on the size of the currently exhausted pool.
1457     */
1458    uint32_t descriptor_count = 64;
1459    if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
1460       struct v3dv_descriptor_pool *exhausted_pool =
1461          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
1462       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
1463    }
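   /* Pool sizes therefore grow geometrically (64, 128, 256, ...) and are
    * capped at 1024 sets/descriptors per pool.
    */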
1464 
1465    /* Create the descriptor pool */
1466    cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
1467    VkDescriptorPoolSize pool_size = {
1468       .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
1469       .descriptorCount = descriptor_count,
1470    };
1471    VkDescriptorPoolCreateInfo info = {
1472       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
1473       .maxSets = descriptor_count,
1474       .poolSizeCount = 1,
1475       .pPoolSizes = &pool_size,
1476       .flags = 0,
1477    };
1478    VkResult result =
1479       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1480                                 &info,
1481                                 &cmd_buffer->device->vk.alloc,
1482                                 &cmd_buffer->meta.texel_buffer_copy.dspool);
1483 
1484    if (result == VK_SUCCESS) {
1485       assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
1486       const VkDescriptorPool _pool = cmd_buffer->meta.texel_buffer_copy.dspool;
1487 
1488       v3dv_cmd_buffer_add_private_obj(
1489          cmd_buffer, (uintptr_t) _pool,
1490          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1491 
1492       struct v3dv_descriptor_pool *pool =
1493          v3dv_descriptor_pool_from_handle(_pool);
1494       pool->is_driver_internal = true;
1495    }
1496 
1497    return result;
1498 }
1499 
1500 static VkResult
1501 allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1502                                           VkDescriptorSet *set)
1503 {
1504    /* Make sure we have a descriptor pool */
1505    VkResult result;
1506    if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
1507       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
1508       if (result != VK_SUCCESS)
1509          return result;
1510    }
1511    assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
1512 
1513    /* Allocate descriptor set */
1514    struct v3dv_device *device = cmd_buffer->device;
1515    VkDevice _device = v3dv_device_to_handle(device);
1516    VkDescriptorSetAllocateInfo info = {
1517       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1518       .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
1519       .descriptorSetCount = 1,
1520       .pSetLayouts = &device->meta.texel_buffer_copy.ds_layout,
1521    };
1522    result = v3dv_AllocateDescriptorSets(_device, &info, set);
1523 
1524    /* If we ran out of pool space, grow the pool and try again */
1525    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1526       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
1527       if (result == VK_SUCCESS) {
1528          info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
1529          result = v3dv_AllocateDescriptorSets(_device, &info, set);
1530       }
1531    }
1532 
1533    return result;
1534 }
1535 
1536 static void
1537 get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
1538                                          VkColorComponentFlags cmask,
1539                                          VkComponentMapping *cswizzle,
1540                                          bool is_layered,
1541                                          uint8_t *key)
1542 {
1543    memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1544 
1545    uint32_t *p = (uint32_t *) key;
1546 
1547    *p = format;
1548    p++;
1549 
1550    *p = cmask;
1551    p++;
1552 
1553    /* Note that we are using a single byte for this, so we could pack
1554     * more data into this 32-bit slot in the future.
1555     */
1556    *p = is_layered ? 1 : 0;
1557    p++;
1558 
1559    memcpy(p, cswizzle, sizeof(VkComponentMapping));
1560    p += sizeof(VkComponentMapping) / sizeof(uint32_t);
1561 
1562    assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1563 }
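
/* The key written above consists of three 32-bit words (format, cmask,
 * is_layered) followed by the 16-byte VkComponentMapping, 28 bytes in total,
 * which the final assert checks against
 * V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE. A hypothetical struct view of
 * that layout, for illustration only (the cache uses the raw byte buffer):
 *
 *    struct texel_buffer_copy_key {
 *       uint32_t format;             // VkFormat
 *       uint32_t cmask;              // VkColorComponentFlags
 *       uint32_t is_layered;         // 0 or 1
 *       VkComponentMapping cswizzle; // four 32-bit swizzle selectors
 *    };
 */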
1564 
1565 static bool
1566 create_blit_render_pass(struct v3dv_device *device,
1567                         VkFormat dst_format,
1568                         VkFormat src_format,
1569                         VkRenderPass *pass_load,
1570                         VkRenderPass *pass_no_load);
1571 
1572 static nir_ssa_def *gen_rect_vertices(nir_builder *b);
1573 
1574 static bool
1575 create_pipeline(struct v3dv_device *device,
1576                 struct v3dv_render_pass *pass,
1577                 struct nir_shader *vs_nir,
1578                 struct nir_shader *gs_nir,
1579                 struct nir_shader *fs_nir,
1580                 const VkPipelineVertexInputStateCreateInfo *vi_state,
1581                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
1582                 const VkPipelineColorBlendStateCreateInfo *cb_state,
1583                 const VkPipelineMultisampleStateCreateInfo *ms_state,
1584                 const VkPipelineLayout layout,
1585                 VkPipeline *pipeline);
1586 
1587 static nir_shader *
1588 get_texel_buffer_copy_vs()
1589 {
1590    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1591    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
1592                                                   "meta texel buffer copy vs");
1593    nir_variable *vs_out_pos =
1594       nir_variable_create(b.shader, nir_var_shader_out,
1595                           glsl_vec4_type(), "gl_Position");
1596    vs_out_pos->data.location = VARYING_SLOT_POS;
1597 
1598    nir_ssa_def *pos = gen_rect_vertices(&b);
1599    nir_store_var(&b, vs_out_pos, pos, 0xf);
1600 
1601    return b.shader;
1602 }
1603 
1604 static nir_shader *
1605 get_texel_buffer_copy_gs()
1606 {
1607    /* FIXME: this creates a geometry shader that takes the index of a single
1608     * layer to copy from push constants, so we need to emit a draw call for
1609     * each layer that we want to copy. We could actually do better and have it
1610     * take a range of layers; however, if we were to do this, we would need to
1611     * be careful not to exceed the maximum number of output vertices allowed in
1612     * a geometry shader.
1613     */
1614    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1615    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
1616                                                   "meta texel buffer copy gs");
1617    nir_shader *nir = b.shader;
1618    nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
1619    nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
1620                                (1ull << VARYING_SLOT_LAYER);
1621    nir->info.gs.input_primitive = GL_TRIANGLES;
1622    nir->info.gs.output_primitive = GL_TRIANGLE_STRIP;
1623    nir->info.gs.vertices_in = 3;
1624    nir->info.gs.vertices_out = 3;
1625    nir->info.gs.invocations = 1;
1626    nir->info.gs.active_stream_mask = 0x1;
1627 
1628    /* in vec4 gl_Position[3] */
1629    nir_variable *gs_in_pos =
1630       nir_variable_create(b.shader, nir_var_shader_in,
1631                           glsl_array_type(glsl_vec4_type(), 3, 0),
1632                           "in_gl_Position");
1633    gs_in_pos->data.location = VARYING_SLOT_POS;
1634 
1635    /* out vec4 gl_Position */
1636    nir_variable *gs_out_pos =
1637       nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
1638                           "out_gl_Position");
1639    gs_out_pos->data.location = VARYING_SLOT_POS;
1640 
1641    /* out float gl_Layer */
1642    nir_variable *gs_out_layer =
1643       nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
1644                           "out_gl_Layer");
1645    gs_out_layer->data.location = VARYING_SLOT_LAYER;
1646 
1647    /* Emit output triangle */
1648    for (uint32_t i = 0; i < 3; i++) {
1649       /* gl_Position from shader input */
1650       nir_deref_instr *in_pos_i =
1651          nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
1652       nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
1653 
1654       /* gl_Layer from push constants */
1655       nir_ssa_def *layer =
1656          nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1657                                 .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
1658                                 .range = 4);
1659       nir_store_var(&b, gs_out_layer, layer, 0x1);
1660 
1661       nir_emit_vertex(&b, 0);
1662    }
1663 
1664    nir_end_primitive(&b, 0);
1665 
1666    return nir;
1667 }
1668 
1669 static nir_ssa_def *
1670 load_frag_coord(nir_builder *b)
1671 {
1672    nir_foreach_shader_in_variable(var, b->shader) {
1673       if (var->data.location == VARYING_SLOT_POS)
1674          return nir_load_var(b, var);
1675    }
1676    nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
1677                                            glsl_vec4_type(), NULL);
1678    pos->data.location = VARYING_SLOT_POS;
1679    return nir_load_var(b, pos);
1680 }
1681 
1682 static uint32_t
1683 component_swizzle_to_nir_swizzle(VkComponentSwizzle comp, VkComponentSwizzle swz)
1684 {
1685    if (swz == VK_COMPONENT_SWIZZLE_IDENTITY)
1686       swz = comp;
1687 
1688    switch (swz) {
1689    case VK_COMPONENT_SWIZZLE_R:
1690       return 0;
1691    case VK_COMPONENT_SWIZZLE_G:
1692       return 1;
1693    case VK_COMPONENT_SWIZZLE_B:
1694       return 2;
1695    case VK_COMPONENT_SWIZZLE_A:
1696       return 3;
1697    default:
1698       unreachable("Invalid swizzle");
1699    };
1700 }
1701 
1702 static nir_shader *
1703 get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format,
1704                          VkComponentMapping *cswizzle)
1705 {
1706    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1707    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
1708                                                   "meta texel buffer copy fs");
1709 
1710    /* We only use the copy from texel buffer shader to implement
1711     * copy_buffer_to_image_shader, which always selects a compatible integer
1712     * format for the copy.
1713     */
1714    assert(vk_format_is_int(format));
1715 
1716    /* Fragment shader output color */
1717    nir_variable *fs_out_color =
1718       nir_variable_create(b.shader, nir_var_shader_out,
1719                           glsl_uvec4_type(), "out_color");
1720    fs_out_color->data.location = FRAG_RESULT_DATA0;
1721 
1722    /* Texel buffer input */
1723    const struct glsl_type *sampler_type =
1724       glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
1725    nir_variable *sampler =
1726       nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
1727    sampler->data.descriptor_set = 0;
1728    sampler->data.binding = 0;
1729 
1730    /* Load the box describing the pixel region we want to copy from the
1731     * texel buffer.
1732     */
1733    nir_ssa_def *box =
1734       nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
1735                              .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
1736                              .range = 16);
1737 
1738    /* Load the buffer stride (this comes in texel units) */
1739    nir_ssa_def *stride =
1740       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1741                              .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
1742                              .range = 4);
1743 
1744    /* Load the buffer offset (this comes in texel units) */
1745    nir_ssa_def *offset =
1746       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1747                              .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
1748                              .range = 4);
1749 
1750    nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b));
1751 
1752    /* Load pixel data from texel buffer based on the x,y offset of the pixel
1753     * within the box. Texel buffers are 1D arrays of texels.
1754     *
1755     * Notice that we already make sure that we only generate fragments that are
1756     * inside the box through the scissor/viewport state, so our offset into the
1757     * texel buffer should always be within its bounds and we don't need
1758     * to add a check for that here.
1759     */
1760    nir_ssa_def *x_offset =
1761       nir_isub(&b, nir_channel(&b, coord, 0),
1762                    nir_channel(&b, box, 0));
1763    nir_ssa_def *y_offset =
1764       nir_isub(&b, nir_channel(&b, coord, 1),
1765                    nir_channel(&b, box, 1));
1766    nir_ssa_def *texel_offset =
1767       nir_iadd(&b, nir_iadd(&b, offset, x_offset),
1768                    nir_imul(&b, y_offset, stride));
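   /* For example, with a box starting at (2, 1), offset = 100 and
    * stride = 64, the fragment at window coordinate (5, 3) reads texel
    * 100 + (5 - 2) + (3 - 1) * 64 = 231.
    */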
1769 
1770    nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
1771    nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
1772    tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
1773    tex->op = nir_texop_txf;
1774    tex->src[0].src_type = nir_tex_src_coord;
1775    tex->src[0].src = nir_src_for_ssa(texel_offset);
1776    tex->src[1].src_type = nir_tex_src_texture_deref;
1777    tex->src[1].src = nir_src_for_ssa(tex_deref);
1778    tex->dest_type = nir_type_uint32;
1779    tex->is_array = false;
1780    tex->coord_components = 1;
1781    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result");
1782    nir_builder_instr_insert(&b, &tex->instr);
1783 
1784    uint32_t swiz[4];
1785    swiz[0] =
1786       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_R, cswizzle->r);
1787    swiz[1] =
1788       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_G, cswizzle->g);
1789    swiz[2] =
1790       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
1791    swiz[3] =
1792       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
1793    nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4);
1794    nir_store_var(&b, fs_out_color, s, 0xf);
1795 
1796    return b.shader;
1797 }
1798 
1799 static bool
1800 create_texel_buffer_copy_pipeline(struct v3dv_device *device,
1801                                   VkFormat format,
1802                                   VkColorComponentFlags cmask,
1803                                   VkComponentMapping *cswizzle,
1804                                   bool is_layered,
1805                                   VkRenderPass _pass,
1806                                   VkPipelineLayout pipeline_layout,
1807                                   VkPipeline *pipeline)
1808 {
1809    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
1810 
1811    assert(vk_format_is_color(format));
1812 
1813    nir_shader *vs_nir = get_texel_buffer_copy_vs();
1814    nir_shader *fs_nir = get_texel_buffer_copy_fs(device, format, cswizzle);
1815    nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs() : NULL;
1816 
1817    const VkPipelineVertexInputStateCreateInfo vi_state = {
1818       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
1819       .vertexBindingDescriptionCount = 0,
1820       .vertexAttributeDescriptionCount = 0,
1821    };
1822 
1823    VkPipelineDepthStencilStateCreateInfo ds_state = {
1824       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
1825    };
1826 
1827    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
1828    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
1829       .blendEnable = false,
1830       .colorWriteMask = cmask,
1831    };
1832 
1833    const VkPipelineColorBlendStateCreateInfo cb_state = {
1834       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
1835       .logicOpEnable = false,
1836       .attachmentCount = 1,
1837       .pAttachments = blend_att_state
1838    };
1839 
1840    const VkPipelineMultisampleStateCreateInfo ms_state = {
1841       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
1842       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
1843       .sampleShadingEnable = false,
1844       .pSampleMask = NULL,
1845       .alphaToCoverageEnable = false,
1846       .alphaToOneEnable = false,
1847    };
1848 
1849    return create_pipeline(device,
1850                           pass,
1851                           vs_nir, gs_nir, fs_nir,
1852                           &vi_state,
1853                           &ds_state,
1854                           &cb_state,
1855                           &ms_state,
1856                           pipeline_layout,
1857                           pipeline);
1858 }
1859 
1860 static bool
1861 get_copy_texel_buffer_pipeline(
1862    struct v3dv_device *device,
1863    VkFormat format,
1864    VkColorComponentFlags cmask,
1865    VkComponentMapping *cswizzle,
1866    VkImageType image_type,
1867    bool is_layered,
1868    struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
1869 {
1870    bool ok = true;
1871 
1872    uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
1873    get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered,
1874                                             key);
1875 
1876    mtx_lock(&device->meta.mtx);
1877    struct hash_entry *entry =
1878       _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
1879                               &key);
1880    if (entry) {
1881       mtx_unlock(&device->meta.mtx);
1882       *pipeline = entry->data;
1883       return true;
1884    }
1885 
1886    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
1887                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1888 
1889    if (*pipeline == NULL)
1890       goto fail;
1891 
1892    /* The blit render pass is compatible */
1893    ok = create_blit_render_pass(device, format, format,
1894                                 &(*pipeline)->pass,
1895                                 &(*pipeline)->pass_no_load);
1896    if (!ok)
1897       goto fail;
1898 
1899    ok =
1900       create_texel_buffer_copy_pipeline(device,
1901                                         format, cmask, cswizzle, is_layered,
1902                                         (*pipeline)->pass,
1903                                         device->meta.texel_buffer_copy.p_layout,
1904                                         &(*pipeline)->pipeline);
1905    if (!ok)
1906       goto fail;
1907 
1908    _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
1909                            &key, *pipeline);
1910 
1911    mtx_unlock(&device->meta.mtx);
1912    return true;
1913 
1914 fail:
1915    mtx_unlock(&device->meta.mtx);
1916 
1917    VkDevice _device = v3dv_device_to_handle(device);
1918    if (*pipeline) {
1919       if ((*pipeline)->pass)
1920          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
1921       if ((*pipeline)->pipeline)
1922          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
1923       vk_free(&device->vk.alloc, *pipeline);
1924       *pipeline = NULL;
1925    }
1926 
1927    return false;
1928 }
1929 
1930 static bool
1931 texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
1932                          VkImageAspectFlags aspect,
1933                          struct v3dv_image *image,
1934                          VkFormat dst_format,
1935                          VkFormat src_format,
1936                          struct v3dv_buffer *buffer,
1937                          uint32_t buffer_bpp,
1938                          VkColorComponentFlags cmask,
1939                          VkComponentMapping *cswizzle,
1940                          uint32_t region_count,
1941                          const VkBufferImageCopy2KHR *regions)
1942 {
1943    VkResult result;
1944    bool handled = false;
1945 
1946    assert(cswizzle);
1947 
1948    /* This is a copy path, so we don't handle format conversions. The only
1949     * exception are stencil to D24S8 copies, which are handled as a color
1950     * masked R8->RGBA8 copy.
1951     */
1952    assert(src_format == dst_format ||
1953           (dst_format == VK_FORMAT_R8G8B8A8_UINT &&
1954            src_format == VK_FORMAT_R8_UINT &&
1955            cmask == VK_COLOR_COMPONENT_R_BIT));
1956 
1957    /* We only handle color copies. Callers can copy D/S aspects by using
1958     * a compatible color format and maybe a cmask/cswizzle for D24 formats.
1959     */
1960    if (aspect != VK_IMAGE_ASPECT_COLOR_BIT)
1961       return handled;
1962 
1963    /* FIXME: we only handle uncompressed images for now. */
1964    if (vk_format_is_compressed(image->vk.format))
1965       return handled;
1966 
1967    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
1968                                             VK_COLOR_COMPONENT_G_BIT |
1969                                             VK_COLOR_COMPONENT_B_BIT |
1970                                             VK_COLOR_COMPONENT_A_BIT;
1971    if (cmask == 0)
1972       cmask = full_cmask;
1973 
1974    /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
1975     * so we can bind it as a texel buffer. Otherwise, the buffer view
1976     * we create below won't setup the texture state that we need for this.
1977     */
1978    if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
1979       if (v3dv_buffer_format_supports_features(
1980              cmd_buffer->device, src_format,
1981              VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) {
1982          buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
1983       } else {
1984          return handled;
1985       }
1986    }
1987 
1988    /* At this point we should be able to handle the copy unless an unexpected
1989     * error occurs, such as an OOM.
1990     */
1991    handled = true;
1992 
1993 
1994    /* Compute the number of layers to copy.
1995     *
1996     * If we are batching (region_count > 1) all our regions have the same
1997     * image subresource so we can take this from the first region. For 3D
1998     * images we require the same depth extent.
1999     */
2000    const VkImageSubresourceLayers *resource = &regions[0].imageSubresource;
2001    uint32_t num_layers;
2002    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2003       num_layers = resource->layerCount;
2004    } else {
2005       assert(region_count == 1);
2006       num_layers = regions[0].imageExtent.depth;
2007    }
2008    assert(num_layers > 0);
2009 
2010    /* Get the texel buffer copy pipeline */
2011    struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
2012    bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device,
2013                                             dst_format, cmask, cswizzle,
2014                                             image->vk.image_type, num_layers > 1,
2015                                             &pipeline);
2016    if (!ok)
2017       return handled;
2018    assert(pipeline && pipeline->pipeline && pipeline->pass);
2019 
2020    /* Setup descriptor set for the source texel buffer. We don't have to
2021     * register the descriptor as a private command buffer object since
2022     * all descriptors will be freed automatically with the descriptor
2023     * pool.
2024     */
2025    VkDescriptorSet set;
2026    result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
2027    if (result != VK_SUCCESS)
2028       return handled;
2029 
2030    /* FIXME: for some reason passing region->bufferOffset here for the
2031     * offset field doesn't work, making the following CTS tests fail:
2032     *
2033     * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset*
2034     *
2035     * So instead we pass 0 here and we pass the offset in texels as a push
2036     * constant to the shader, which seems to work correctly.
2037     */
2038    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
2039    VkBufferViewCreateInfo buffer_view_info = {
2040       .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
2041       .buffer = v3dv_buffer_to_handle(buffer),
2042       .format = src_format,
2043       .offset = 0,
2044       .range = VK_WHOLE_SIZE,
2045    };
2046 
2047    VkBufferView texel_buffer_view;
2048    result = v3dv_CreateBufferView(_device, &buffer_view_info,
2049                                   &cmd_buffer->device->vk.alloc,
2050                                   &texel_buffer_view);
2051    if (result != VK_SUCCESS)
2052       return handled;
2053 
2054    v3dv_cmd_buffer_add_private_obj(
2055       cmd_buffer, (uintptr_t)texel_buffer_view,
2056       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
2057 
2058    VkWriteDescriptorSet write = {
2059       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
2060       .dstSet = set,
2061       .dstBinding = 0,
2062       .dstArrayElement = 0,
2063       .descriptorCount = 1,
2064       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2065       .pTexelBufferView = &texel_buffer_view,
2066    };
2067    v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
2068 
2069    /* Push command buffer state before starting meta operation */
2070    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
2071    uint32_t dirty_dynamic_state = 0;
2072 
2073    /* Bind common state for all layers and regions  */
2074    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2075    v3dv_CmdBindPipeline(_cmd_buffer,
2076                         VK_PIPELINE_BIND_POINT_GRAPHICS,
2077                         pipeline->pipeline);
2078 
2079    v3dv_CmdBindDescriptorSets(_cmd_buffer,
2080                               VK_PIPELINE_BIND_POINT_GRAPHICS,
2081                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2082                               0, 1, &set,
2083                               0, NULL);
2084 
2085    /* Setup framebuffer.
2086     *
2087     * For 3D images, this creates a layered framebuffer with a number of
2088     * layers matching the depth extent of the 3D image.
2089     */
2090    uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel);
2091    uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel);
2092    VkImageViewCreateInfo image_view_info = {
2093       .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2094       .image = v3dv_image_to_handle(image),
2095       .viewType = v3dv_image_type_to_view_type(image->vk.image_type),
2096       .format = dst_format,
2097       .subresourceRange = {
2098          .aspectMask = aspect,
2099          .baseMipLevel = resource->mipLevel,
2100          .levelCount = 1,
2101          .baseArrayLayer = resource->baseArrayLayer,
2102          .layerCount = num_layers,
2103       },
2104    };
2105    VkImageView image_view;
2106    result = v3dv_CreateImageView(_device, &image_view_info,
2107                                  &cmd_buffer->device->vk.alloc, &image_view);
2108    if (result != VK_SUCCESS)
2109       goto fail;
2110 
2111    v3dv_cmd_buffer_add_private_obj(
2112       cmd_buffer, (uintptr_t)image_view,
2113       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
2114 
2115    VkFramebufferCreateInfo fb_info = {
2116       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
2117       .renderPass = pipeline->pass,
2118       .attachmentCount = 1,
2119       .pAttachments = &image_view,
2120       .width = fb_width,
2121       .height = fb_height,
2122       .layers = num_layers,
2123    };
2124 
2125    VkFramebuffer fb;
2126    result = v3dv_CreateFramebuffer(_device, &fb_info,
2127                                    &cmd_buffer->device->vk.alloc, &fb);
2128    if (result != VK_SUCCESS)
2129       goto fail;
2130 
2131    v3dv_cmd_buffer_add_private_obj(
2132       cmd_buffer, (uintptr_t)fb,
2133       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
2134 
2135    /* For each layer */
2136    for (uint32_t l = 0; l < num_layers; l++) {
2137        /* Start render pass for this layer.
2138         *
2139         * If we only have one region to copy, then we might be able to
2140         * skip the TLB load if it is aligned to tile boundaries. All layers
2141         * copy the same area, so we only need to check this once.
2142         */
2143       bool can_skip_tlb_load = false;
2144       VkRect2D render_area;
2145       if (region_count == 1) {
2146          render_area.offset.x = regions[0].imageOffset.x;
2147          render_area.offset.y = regions[0].imageOffset.y;
2148          render_area.extent.width = regions[0].imageExtent.width;
2149          render_area.extent.height = regions[0].imageExtent.height;
2150 
2151          if (l == 0) {
2152             struct v3dv_render_pass *pipeline_pass =
2153                v3dv_render_pass_from_handle(pipeline->pass);
2154             can_skip_tlb_load =
2155                cmask == full_cmask &&
2156                v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
2157                                                  v3dv_framebuffer_from_handle(fb),
2158                                                  pipeline_pass, 0);
2159          }
2160       } else {
2161          render_area.offset.x = 0;
2162          render_area.offset.y = 0;
2163          render_area.extent.width = fb_width;
2164          render_area.extent.height = fb_height;
2165       }
2166 
2167       VkRenderPassBeginInfo rp_info = {
2168          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
2169          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
2170                                            pipeline->pass,
2171          .framebuffer = fb,
2172          .renderArea = render_area,
2173          .clearValueCount = 0,
2174       };
2175 
2176       v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
2177       struct v3dv_job *job = cmd_buffer->state.job;
2178       if (!job)
2179          goto fail;
2180 
2181       /* If we are using a layered copy we need to specify the layer for the
2182        * Geometry Shader.
2183        */
2184       if (num_layers > 1) {
2185          uint32_t layer = resource->baseArrayLayer + l;
2186          v3dv_CmdPushConstants(_cmd_buffer,
2187                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2188                                VK_SHADER_STAGE_GEOMETRY_BIT,
2189                                24, 4, &layer);
2190       }
2191 
2192       /* For each region */
2193       dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
2194       for (uint32_t r = 0; r < region_count; r++) {
2195          const VkBufferImageCopy2KHR *region = &regions[r];
2196 
2197          /* Obtain the 2D buffer region spec */
2198          uint32_t buf_width, buf_height;
2199          if (region->bufferRowLength == 0)
2200              buf_width = region->imageExtent.width;
2201          else
2202              buf_width = region->bufferRowLength;
2203 
2204          if (region->bufferImageHeight == 0)
2205              buf_height = region->imageExtent.height;
2206          else
2207              buf_height = region->bufferImageHeight;
2208 
2209          const VkViewport viewport = {
2210             .x = region->imageOffset.x,
2211             .y = region->imageOffset.y,
2212             .width = region->imageExtent.width,
2213             .height = region->imageExtent.height,
2214             .minDepth = 0.0f,
2215             .maxDepth = 1.0f
2216          };
2217          v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
2218          const VkRect2D scissor = {
2219             .offset = { region->imageOffset.x, region->imageOffset.y },
2220             .extent = { region->imageExtent.width, region->imageExtent.height }
2221          };
2222          v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
2223 
2224          const VkDeviceSize buf_offset =
2225             region->bufferOffset / buffer_bpp  + l * buf_height * buf_width;
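         /* For example, a bufferOffset of 256 bytes with 4-byte texels and a
          * 64x64 buffer footprint yields texel offsets 64, 4160, 8256, ...
          * for layers 0, 1, 2 of a layered copy.
          */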
2226          uint32_t push_data[6] = {
2227             region->imageOffset.x,
2228             region->imageOffset.y,
2229             region->imageOffset.x + region->imageExtent.width - 1,
2230             region->imageOffset.y + region->imageExtent.height - 1,
2231             buf_width,
2232             buf_offset,
2233          };
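         /* A hypothetical struct view of the push constants consumed by the
          * copy shaders, assuming the TEXEL_BUFFER_COPY_*_PC_OFFSET defines
          * are 0 (box), 16 (stride), 20 (offset) and 24 (gs layer), matching
          * the 24-byte fragment push below and the 4-byte geometry push
          * above:
          *
          *    struct texel_buffer_copy_push {
          *       uint32_t box[4];   // x0, y0, x1, y1 (inclusive)
          *       uint32_t stride;   // buffer row stride, in texels
          *       uint32_t offset;   // buffer base offset, in texels
          *       uint32_t layer;    // gl_Layer, only for layered copies
          *    };
          */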
2234 
2235          v3dv_CmdPushConstants(_cmd_buffer,
2236                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2237                                VK_SHADER_STAGE_FRAGMENT_BIT,
2238                                0, sizeof(push_data), &push_data);
2239 
2240          v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
2241       } /* For each region */
2242 
2243       v3dv_CmdEndRenderPass(_cmd_buffer);
2244    } /* For each layer */
2245 
2246 fail:
2247    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
2248    return handled;
2249 }
2250 
2251 /**
2252  * Returns true if the implementation supports the requested operation (even if
2253  * it failed to process it, for example, due to an out-of-memory error).
2254  */
2255 static bool
2256 copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2257                           VkImageAspectFlags aspect,
2258                           struct v3dv_image *image,
2259                           VkFormat dst_format,
2260                           VkFormat src_format,
2261                           struct v3dv_buffer *buffer,
2262                           uint32_t buffer_bpp,
2263                           VkColorComponentFlags cmask,
2264                           VkComponentMapping *cswizzle,
2265                           uint32_t region_count,
2266                           const VkBufferImageCopy2KHR *regions)
2267 {
2268    /* Since we can't sample linear images we need to upload the linear
2269     * buffer to a tiled image that we can use as a blit source, which
2270     * is slow.
2271     */
2272    perf_debug("Falling back to blit path for buffer to image copy.\n");
2273 
2274    struct v3dv_device *device = cmd_buffer->device;
2275    VkDevice _device = v3dv_device_to_handle(device);
2276    bool handled = true;
2277 
2278    /* Allocate memory for the tiled image. Since we copy layer by layer
2279     * we allocate memory to hold a full layer, which is the worst case.
2280     * For that we create a dummy image with that spec, get memory requirements
2281     * for it and use that information to create the memory allocation.
2282     * We will then reuse this memory store for all the regions we want to
2283     * copy.
2284     */
2285    VkImage dummy_image;
2286    VkImageCreateInfo dummy_info = {
2287       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2288       .imageType = VK_IMAGE_TYPE_2D,
2289       .format = src_format,
2290       .extent = { image->vk.extent.width, image->vk.extent.height, 1 },
2291       .mipLevels = 1,
2292       .arrayLayers = 1,
2293       .samples = VK_SAMPLE_COUNT_1_BIT,
2294       .tiling = VK_IMAGE_TILING_OPTIMAL,
2295       .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2296                VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2297       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2298       .queueFamilyIndexCount = 0,
2299       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2300    };
2301    VkResult result =
2302       v3dv_CreateImage(_device, &dummy_info, &device->vk.alloc, &dummy_image);
2303    if (result != VK_SUCCESS)
2304       return handled;
2305 
2306    VkMemoryRequirements reqs;
2307    vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs);
2308    v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc);
2309 
2310    VkDeviceMemory mem;
2311    VkMemoryAllocateInfo alloc_info = {
2312       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2313       .allocationSize = reqs.size,
2314       .memoryTypeIndex = 0,
2315    };
2316    result = v3dv_AllocateMemory(_device, &alloc_info, &device->vk.alloc, &mem);
2317    if (result != VK_SUCCESS)
2318       return handled;
2319 
2320    v3dv_cmd_buffer_add_private_obj(
2321       cmd_buffer, (uintptr_t)mem,
2322       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2323 
2324    /* Obtain the layer count.
2325     *
2326     * If we are batching (region_count > 1) all our regions have the same
2327     * image subresource so we can take this from the first region.
2328     */
2329    uint32_t num_layers;
2330    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
2331       num_layers = regions[0].imageSubresource.layerCount;
2332    else
2333       num_layers = regions[0].imageExtent.depth;
2334    assert(num_layers > 0);
2335 
2336    /* Sanity check: we can only batch multiple regions together if they have
2337     * the same framebuffer (so the same layer).
2338     */
2339    assert(num_layers == 1 || region_count == 1);
2340 
2341    const uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
2342    const uint32_t block_height = vk_format_get_blockheight(image->vk.format);
2343 
2344    /* Copy regions by uploading each region to a temporary tiled image using
2345     * the memory we have just allocated as storage.
2346     */
2347    for (uint32_t r = 0; r < region_count; r++) {
2348       const VkBufferImageCopy2KHR *region = &regions[r];
2349 
2350       /* Obtain the 2D buffer region spec */
2351       uint32_t buf_width, buf_height;
2352       if (region->bufferRowLength == 0)
2353           buf_width = region->imageExtent.width;
2354       else
2355           buf_width = region->bufferRowLength;
2356 
2357       if (region->bufferImageHeight == 0)
2358           buf_height = region->imageExtent.height;
2359       else
2360           buf_height = region->bufferImageHeight;
2361 
2362       /* If the image is compressed, the bpp refers to blocks, not pixels */
2363       buf_width = buf_width / block_width;
2364       buf_height = buf_height / block_height;
2365 
2366       for (uint32_t i = 0; i < num_layers; i++) {
2367          /* Create the tiled image */
2368          VkImageCreateInfo image_info = {
2369             .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2370             .imageType = VK_IMAGE_TYPE_2D,
2371             .format = src_format,
2372             .extent = { buf_width, buf_height, 1 },
2373             .mipLevels = 1,
2374             .arrayLayers = 1,
2375             .samples = VK_SAMPLE_COUNT_1_BIT,
2376             .tiling = VK_IMAGE_TILING_OPTIMAL,
2377             .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2378                      VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2379             .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2380             .queueFamilyIndexCount = 0,
2381             .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2382          };
2383 
2384          VkImage buffer_image;
2385          VkResult result =
2386             v3dv_CreateImage(_device, &image_info, &device->vk.alloc,
2387                              &buffer_image);
2388          if (result != VK_SUCCESS)
2389             return handled;
2390 
2391          v3dv_cmd_buffer_add_private_obj(
2392             cmd_buffer, (uintptr_t)buffer_image,
2393             (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
2394 
2395          result = vk_common_BindImageMemory(_device, buffer_image, mem, 0);
2396          if (result != VK_SUCCESS)
2397             return handled;
2398 
2399          /* Upload buffer contents for the selected layer */
2400          const VkDeviceSize buf_offset_bytes =
2401             region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
2402          const VkBufferImageCopy2KHR buffer_image_copy = {
2403             .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR,
2404             .bufferOffset = buf_offset_bytes,
2405             .bufferRowLength = region->bufferRowLength / block_width,
2406             .bufferImageHeight = region->bufferImageHeight / block_height,
2407             .imageSubresource = {
2408                .aspectMask = aspect,
2409                .mipLevel = 0,
2410                .baseArrayLayer = 0,
2411                .layerCount = 1,
2412             },
2413             .imageOffset = { 0, 0, 0 },
2414             .imageExtent = { buf_width, buf_height, 1 }
2415          };
2416          handled =
2417             create_tiled_image_from_buffer(cmd_buffer,
2418                                            v3dv_image_from_handle(buffer_image),
2419                                            buffer, &buffer_image_copy);
2420          if (!handled) {
2421             /* This is unexpected, we should have setup the upload to be
2422              * conformant to a TFU or TLB copy.
2423              */
2424             unreachable("Unable to copy buffer to image through TLB");
2425             return false;
2426          }
2427 
2428          /* Blit-copy the requested image extent from the buffer image to the
2429           * destination image.
2430           *
2431           * Since we are copying, the blit must use the same format on the
2432           * destination and source images to avoid format conversions. The
2433           * only exception is copying stencil, which we upload to a R8UI source
2434           * image, but that we need to blit to a S8D24 destination (the only
2435           * stencil format we support).
2436           */
2437          const VkImageBlit2KHR blit_region = {
2438             .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
2439             .srcSubresource = {
2440                .aspectMask = aspect,
2441                .mipLevel = 0,
2442                .baseArrayLayer = 0,
2443                .layerCount = 1,
2444             },
2445             .srcOffsets = {
2446                { 0, 0, 0 },
2447                { region->imageExtent.width, region->imageExtent.height, 1 },
2448             },
2449             .dstSubresource = {
2450                .aspectMask = aspect,
2451                .mipLevel = region->imageSubresource.mipLevel,
2452                .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
2453                .layerCount = 1,
2454             },
2455             .dstOffsets = {
2456                {
2457                   DIV_ROUND_UP(region->imageOffset.x, block_width),
2458                   DIV_ROUND_UP(region->imageOffset.y, block_height),
2459                   region->imageOffset.z + i,
2460                },
2461                {
2462                   DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
2463                                block_width),
2464                   DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
2465                                block_height),
2466                   region->imageOffset.z + i + 1,
2467                },
2468             },
2469          };
2470 
2471          handled = blit_shader(cmd_buffer,
2472                                image, dst_format,
2473                                v3dv_image_from_handle(buffer_image), src_format,
2474                                cmask, cswizzle,
2475                                &blit_region, VK_FILTER_NEAREST, true);
2476          if (!handled) {
2477             /* This is unexpected, we should have a supported blit spec */
2478             unreachable("Unable to blit buffer to destination image");
2479             return false;
2480          }
2481       }
2482    }
2483 
2484    return handled;
2485 }
2486 
2487 /**
2488  * Returns true if the implementation supports the requested operation (even if
2489  * it failed to process it, for example, due to an out-of-memory error).
2490  */
2491 static bool
2492 copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
2493                             struct v3dv_image *image,
2494                             struct v3dv_buffer *buffer,
2495                             uint32_t region_count,
2496                             const VkBufferImageCopy2KHR *regions,
2497                             bool use_texel_buffer)
2498 {
2499    /* We can only call this with region_count > 1 if we can batch the regions
2500     * together, in which case they share the same image subresource, and so
2501     * the same aspect.
2502     */
2503    VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
2504 
2505    /* Generally, the bpp of the data in the buffer matches that of the
2506     * destination image. The exception is the case where we are uploading
2507     * stencil (8bpp) to a combined d24s8 image (32bpp).
2508     */
2509    uint32_t buf_bpp = image->cpp;
2510 
2511    /* We are about to upload the buffer data to an image so we can then
2512     * blit that to our destination region. Because we are going to implement
2513     * the copy as a blit, we want our blit source and destination formats to be
2514     * the same (to avoid any format conversions), so we choose a canonical
2515     * format that matches the destination image bpp.
2516     */
2517    VkComponentMapping ident_swizzle = {
2518       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
2519       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
2520       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
2521       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
2522    };
2523 
2524    VkComponentMapping cswizzle = ident_swizzle;
2525    VkColorComponentFlags cmask = 0; /* Write all components */
2526    VkFormat src_format;
2527    VkFormat dst_format;
2528    switch (buf_bpp) {
2529    case 16:
2530       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2531       src_format = VK_FORMAT_R32G32B32A32_UINT;
2532       dst_format = src_format;
2533       break;
2534    case 8:
2535       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2536       src_format = VK_FORMAT_R16G16B16A16_UINT;
2537       dst_format = src_format;
2538       break;
2539    case 4:
2540       switch (aspect) {
2541       case VK_IMAGE_ASPECT_COLOR_BIT:
2542          src_format = VK_FORMAT_R8G8B8A8_UINT;
2543          dst_format = src_format;
2544          break;
2545       case VK_IMAGE_ASPECT_DEPTH_BIT:
2546          assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
2547                 image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
2548                 image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
2549          src_format = VK_FORMAT_R8G8B8A8_UINT;
2550          dst_format = src_format;
2551          aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2552 
2553          /* For D24 formats, the Vulkan spec states that the depth component
2554           * in the buffer is stored in the 24-LSB, but V3D wants it in the
2555           * 24-MSB.
2556           */
2557          if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
2558              image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
2559             cmask = VK_COLOR_COMPONENT_G_BIT |
2560                     VK_COLOR_COMPONENT_B_BIT |
2561                     VK_COLOR_COMPONENT_A_BIT;
2562             cswizzle.r = VK_COMPONENT_SWIZZLE_R;
2563             cswizzle.g = VK_COMPONENT_SWIZZLE_R;
2564             cswizzle.b = VK_COMPONENT_SWIZZLE_G;
2565             cswizzle.a = VK_COMPONENT_SWIZZLE_B;
2566          }
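         /* In other words, buffer bytes 0-2 (the D24 value in the LSBs of
          * each 32-bit word) land in image bytes 1-3 (the 24 MSBs), while
          * the masked-out R component leaves the image's stencil byte
          * untouched.
          */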
2567          break;
2568       case VK_IMAGE_ASPECT_STENCIL_BIT:
2569          /* Since we don't support separate stencil this is always a stencil
2570           * copy to a combined depth/stencil image. Because we don't support
2571           * separate stencil images, we interpret the buffer data as a
2572           * color R8UI image, and implement the blit as a compatible color
2573           * blit to an RGBA8UI destination masking out writes to components
2574           * GBA (which map to the D24 component of a S8D24 image).
2575           */
2576          assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
2577          buf_bpp = 1;
2578          src_format = VK_FORMAT_R8_UINT;
2579          dst_format = VK_FORMAT_R8G8B8A8_UINT;
2580          cmask = VK_COLOR_COMPONENT_R_BIT;
2581          aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2582          break;
2583       default:
2584          unreachable("unsupported aspect");
2585          return false;
2586       };
2587       break;
2588    case 2:
2589       aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2590       src_format = VK_FORMAT_R16_UINT;
2591       dst_format = src_format;
2592       break;
2593    case 1:
2594       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2595       src_format = VK_FORMAT_R8_UINT;
2596       dst_format = src_format;
2597       break;
2598    default:
2599       unreachable("unsupported bit-size");
2600       return false;
2601    }
2602 
2603    if (use_texel_buffer) {
2604       return texel_buffer_shader_copy(cmd_buffer, aspect, image,
2605                                       dst_format, src_format,
2606                                       buffer, buf_bpp,
2607                                       cmask, &cswizzle,
2608                                       region_count, regions);
2609    } else {
2610       return copy_buffer_to_image_blit(cmd_buffer, aspect, image,
2611                                        dst_format, src_format,
2612                                        buffer, buf_bpp,
2613                                        cmask, &cswizzle,
2614                                        region_count, regions);
2615    }
2616 }
2617 
2618 /**
2619  * Returns true if the implementation supports the requested operation (even if
2620  * it failed to process it, for example, due to an out-of-memory error).
2621  */
2622 static bool
2623 copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
2624                          struct v3dv_image *image,
2625                          struct v3dv_buffer *buffer,
2626                          const VkBufferImageCopy2KHR *region)
2627 {
2628    /* FIXME */
2629    if (vk_format_is_depth_or_stencil(image->vk.format))
2630       return false;
2631 
2632    if (vk_format_is_compressed(image->vk.format))
2633       return false;
2634 
2635    if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
2636       return false;
2637 
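        /* A bufferRowLength / bufferImageHeight of 0 means the buffer rows and
         * layers are tightly packed according to imageExtent, as defined by the
         * Vulkan spec, so default to the image extent in that case.
         */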
2638    uint32_t buffer_width, buffer_height;
2639    if (region->bufferRowLength == 0)
2640       buffer_width = region->imageExtent.width;
2641    else
2642       buffer_width = region->bufferRowLength;
2643 
2644    if (region->bufferImageHeight == 0)
2645       buffer_height = region->imageExtent.height;
2646    else
2647       buffer_height = region->bufferImageHeight;
2648 
2649    uint32_t buffer_stride = buffer_width * image->cpp;
2650    uint32_t buffer_layer_stride = buffer_stride * buffer_height;
2651 
2652    uint32_t num_layers;
2653    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
2654       num_layers = region->imageSubresource.layerCount;
2655    else
2656       num_layers = region->imageExtent.depth;
2657    assert(num_layers > 0);
2658 
2659    struct v3dv_job *job =
2660       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
2661                                      V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
2662                                      cmd_buffer, -1);
2663    if (!job)
2664       return true;
2665 
2666    job->cpu.copy_buffer_to_image.image = image;
2667    job->cpu.copy_buffer_to_image.buffer = buffer;
2668    job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride;
2669    job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride;
2670    job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset;
2671    job->cpu.copy_buffer_to_image.image_extent = region->imageExtent;
2672    job->cpu.copy_buffer_to_image.image_offset = region->imageOffset;
2673    job->cpu.copy_buffer_to_image.mip_level =
2674       region->imageSubresource.mipLevel;
2675    job->cpu.copy_buffer_to_image.base_layer =
2676       region->imageSubresource.baseArrayLayer;
2677    job->cpu.copy_buffer_to_image.layer_count = num_layers;
2678 
2679    list_addtail(&job->list_link, &cmd_buffer->jobs);
2680 
2681    return true;
2682 }
2683 
2684 VKAPI_ATTR void VKAPI_CALL
2685 v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
2686                               const VkCopyBufferToImageInfo2KHR *info)
2687 {
2688    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2689    V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
2690    V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage);
2691 
2692    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
2693 
2694    uint32_t r = 0;
2695    while (r < info->regionCount) {
2696       /* The TFU and TLB paths can only copy one region at a time and the region
2697        * needs to start at the origin. We try these first for the common case
2698        * where we are copying full images, since they should be the fastest.
2699        */
2700       uint32_t batch_size = 1;
2701       if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r]))
2702          goto handled;
2703 
2704       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r]))
2705          goto handled;
2706 
2707       /* Otherwise, we are copying subrects, so we fall back to copying
2708        * via a shader and texel buffers, and we try to batch the regions
2709        * if possible. We can only batch copies if they have the same
2710        * framebuffer spec, which is mostly determined by the image
2711        * subresource of the region.
2712        */
2713       const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource;
2714       for (uint32_t s = r + 1; s < info->regionCount; s++) {
2715          const VkImageSubresourceLayers *rsc_s =
2716             &info->pRegions[s].imageSubresource;
2717 
2718          if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0)
2719             break;
2720 
2721          /* For 3D images we also need to check the depth extent */
2722          if (image->vk.image_type == VK_IMAGE_TYPE_3D &&
2723              info->pRegions[s].imageExtent.depth !=
2724              info->pRegions[r].imageExtent.depth) {
2725                break;
2726          }
2727 
2728          batch_size++;
2729       }
2730 
2731       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
2732                                       batch_size, &info->pRegions[r], true)) {
2733          goto handled;
2734       }
2735 
2736       /* If we still could not copy, fall back to slower paths.
2737        *
2738        * FIXME: we could try to batch these too, but since they are bound to be
2739        * slow it might not be worth it; we should instead put more effort
2740        * into handling more cases with the other paths.
2741        */
2742       if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer,
2743                                    &info->pRegions[r])) {
2744          batch_size = 1;
2745          goto handled;
2746       }
2747 
2748       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
2749                                       batch_size, &info->pRegions[r], false)) {
2750          goto handled;
2751       }
2752 
2753       unreachable("Unsupported buffer to image copy.");
2754 
2755 handled:
2756       r += batch_size;
2757    }
2758 }
2759 
2760 static void
2761 compute_blit_3d_layers(const VkOffset3D *offsets,
2762                        uint32_t *min_layer, uint32_t *max_layer,
2763                        bool *mirror_z);
2764 
2765 /**
2766  * Returns true if the implementation supports the requested operation (even if
2767  * it failed to process it, for example, due to an out-of-memory error).
2768  *
2769  * The TFU blit path doesn't handle scaling so the blit filter parameter can
2770  * be ignored.
2771  */
2772 static bool
2773 blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
2774          struct v3dv_image *dst,
2775          struct v3dv_image *src,
2776          const VkImageBlit2KHR *region)
2777 {
2778    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
2779    assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
2780 
2781    /* Format must match */
2782    if (src->vk.format != dst->vk.format)
2783       return false;
2784 
2785    /* Destination can't be raster format */
2786    if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR)
2787       return false;
2788 
2789    /* Source region must start at (0,0) */
2790    if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
2791       return false;
2792 
2793    /* Destination region must cover the complete destination level */
2794    if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
2795       return false;
2796 
2797    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
2798    const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
2799    const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
2800    if (region->dstOffsets[1].x < dst_width - 1 ||
2801        region->dstOffsets[1].y < dst_height - 1) {
2802       return false;
2803    }
2804 
2805    /* No XY scaling */
2806    if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
2807        region->srcOffsets[1].y != region->dstOffsets[1].y) {
2808       return false;
2809    }
2810 
2811    /* If the format is D24S8 both aspects need to be copied, since the TFU
2812     * can't be programmed to copy only one aspect of the image.
2813     */
2814    if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
2815        const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
2816                                              VK_IMAGE_ASPECT_STENCIL_BIT;
2817        if (region->dstSubresource.aspectMask != ds_aspects)
2818           return false;
2819    }
2820 
2821    /* Our TFU blits only handle exact copies (they require the same format
2822     * on input and output, no scaling, etc.), so there are no pixel format
2823     * conversions and we can rewrite the format to one that is TFU
2824     * compatible based on its texel size.
2825     */
2826    const struct v3dv_format *format =
2827       v3dv_get_compatible_tfu_format(cmd_buffer->device,
2828                                      dst->cpp, NULL);
2829 
2830    /* Emit a TFU job for each layer to blit */
2831    assert(region->dstSubresource.layerCount ==
2832           region->srcSubresource.layerCount);
2833 
2834    uint32_t min_dst_layer;
2835    uint32_t max_dst_layer;
2836    bool dst_mirror_z = false;
2837    if (dst->vk.image_type == VK_IMAGE_TYPE_3D) {
2838       compute_blit_3d_layers(region->dstOffsets,
2839                              &min_dst_layer, &max_dst_layer,
2840                              &dst_mirror_z);
2841    } else {
2842       min_dst_layer = region->dstSubresource.baseArrayLayer;
2843       max_dst_layer = min_dst_layer + region->dstSubresource.layerCount;
2844    }
2845 
2846    uint32_t min_src_layer;
2847    uint32_t max_src_layer;
2848    bool src_mirror_z = false;
2849    if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
2850       compute_blit_3d_layers(region->srcOffsets,
2851                              &min_src_layer, &max_src_layer,
2852                              &src_mirror_z);
2853    } else {
2854       min_src_layer = region->srcSubresource.baseArrayLayer;
2855       max_src_layer = min_src_layer + region->srcSubresource.layerCount;
2856    }
2857 
2858    /* No Z scaling for 3D images (for non-3D images both src and dst must
2859     * have the same layerCount).
2860     */
2861    if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
2862       return false;
2863 
2864    const uint32_t layer_count = max_dst_layer - min_dst_layer;
2865    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
2866    for (uint32_t i = 0; i < layer_count; i++) {
2867       /* Since the TFU path doesn't handle scaling, Z mirroring for 3D images
2868        * only involves reversing the order of the slices.
2869        */
2870       const uint32_t dst_layer =
2871          dst_mirror_z ? max_dst_layer - i - 1 : min_dst_layer + i;
2872       const uint32_t src_layer =
2873          src_mirror_z ? max_src_layer - i - 1 : min_src_layer + i;
2874       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)
2875          (cmd_buffer, dst, dst_mip_level, dst_layer,
2876           src, src_mip_level, src_layer,
2877           dst_width, dst_height, format);
2878    }
2879 
2880    return true;
2881 }
2882 
2883 static bool
2884 format_needs_software_int_clamp(VkFormat format)
2885 {
2886    switch (format) {
2887       case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2888       case VK_FORMAT_A2R10G10B10_SINT_PACK32:
2889       case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2890       case VK_FORMAT_A2B10G10R10_SINT_PACK32:
2891          return true;
2892       default:
2893          return false;
2894    };
2895 }
2896 
2897 static void
2898 get_blit_pipeline_cache_key(VkFormat dst_format,
2899                             VkFormat src_format,
2900                             VkColorComponentFlags cmask,
2901                             VkSampleCountFlagBits dst_samples,
2902                             VkSampleCountFlagBits src_samples,
2903                             uint8_t *key)
2904 {
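        /* The key is packed as four consecutive 32-bit words: destination
         * format, source format (only when software integer clamping is
         * required, see below), color write mask, and the destination/source
         * sample counts, adding up to V3DV_META_BLIT_CACHE_KEY_SIZE bytes.
         */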
2905    memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
2906 
2907    uint32_t *p = (uint32_t *) key;
2908 
2909    *p = dst_format;
2910    p++;
2911 
2912    /* Generally, when blitting from a larger format to a smaller format
2913     * the hardware takes care of clamping the source to the RT range.
2914     * Specifically, for integer formats this is done by using
2915     * V3D_RENDER_TARGET_CLAMP_INT in the render target setup; however, this
2916     * clamps to the bit-size of the render type, and some formats, such as
2917     * rgb10a2_uint, have a 16-bit type, so it won't do what we need and we
2918     * have to clamp in software. In these cases, we need to amend the blit
2919     * shader with clamp code that depends on both the src and dst formats, so
2920     * we need the src format to be part of the key.
2921     */
2922    *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
2923    p++;
2924 
2925    *p = cmask;
2926    p++;
2927 
2928    *p = (dst_samples << 8) | src_samples;
2929    p++;
2930 
2931    assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
2932 }
2933 
2934 static bool
2935 create_blit_render_pass(struct v3dv_device *device,
2936                         VkFormat dst_format,
2937                         VkFormat src_format,
2938                         VkRenderPass *pass_load,
2939                         VkRenderPass *pass_no_load)
2940 {
2941    const bool is_color_blit = vk_format_is_color(dst_format);
2942 
2943    /* Attachment load operation is specified below */
2944    VkAttachmentDescription att = {
2945       .format = dst_format,
2946       .samples = VK_SAMPLE_COUNT_1_BIT,
2947       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
2948       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2949       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
2950    };
2951 
2952    VkAttachmentReference att_ref = {
2953       .attachment = 0,
2954       .layout = VK_IMAGE_LAYOUT_GENERAL,
2955    };
2956 
2957    VkSubpassDescription subpass = {
2958       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
2959       .inputAttachmentCount = 0,
2960       .colorAttachmentCount = is_color_blit ? 1 : 0,
2961       .pColorAttachments = is_color_blit ? &att_ref : NULL,
2962       .pResolveAttachments = NULL,
2963       .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
2964       .preserveAttachmentCount = 0,
2965       .pPreserveAttachments = NULL,
2966    };
2967 
2968    VkRenderPassCreateInfo info = {
2969       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
2970       .attachmentCount = 1,
2971       .pAttachments = &att,
2972       .subpassCount = 1,
2973       .pSubpasses = &subpass,
2974       .dependencyCount = 0,
2975       .pDependencies = NULL,
2976    };
2977 
2978    VkResult result;
2979    att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
2980    result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
2981                                   &info, &device->vk.alloc, pass_load);
2982    if (result != VK_SUCCESS)
2983       return false;
2984 
2985    att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
2986    result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
2987                                   &info, &device->vk.alloc, pass_no_load);
2988    return result == VK_SUCCESS;
2989 }
2990 
2991 static nir_ssa_def *
2992 gen_rect_vertices(nir_builder *b)
2993 {
2994    nir_ssa_def *vertex_id = nir_load_vertex_id(b);
2995 
2996    /* vertex 0: -1.0, -1.0
2997     * vertex 1: -1.0,  1.0
2998     * vertex 2:  1.0, -1.0
2999     * vertex 3:  1.0,  1.0
3000     *
3001     * so:
3002     *
3003     * channel 0 is vertex_id < 2 ? -1.0 :  1.0
3004     * channel 1 is vertex id & 1 ?  1.0 : -1.0
3005     */
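        /* These four vertices are drawn as a triangle strip (the blit pipeline
         * uses VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, see create_pipeline), so
         * they produce a single full-viewport quad.
         */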
3006 
3007    nir_ssa_def *one = nir_imm_int(b, 1);
3008    nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
3009    nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3010 
3011    nir_ssa_def *comp[4];
3012    comp[0] = nir_bcsel(b, c0cmp,
3013                        nir_imm_float(b, -1.0f),
3014                        nir_imm_float(b, 1.0f));
3015 
3016    comp[1] = nir_bcsel(b, c1cmp,
3017                        nir_imm_float(b, 1.0f),
3018                        nir_imm_float(b, -1.0f));
3019    comp[2] = nir_imm_float(b, 0.0f);
3020    comp[3] = nir_imm_float(b, 1.0f);
3021    return nir_vec(b, comp, 4);
3022 }
3023 
3024 static nir_ssa_def *
3025 gen_tex_coords(nir_builder *b)
3026 {
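        /* The vertex push constants are 20 bytes, matching the push constant
         * range declared for the blit pipeline layout: a vec4 with the source
         * box (x0, y0, x1, y1) at offset 0, followed by the source Z
         * coordinate at offset 16.
         */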
3027    nir_ssa_def *tex_box =
3028       nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
3029 
3030    nir_ssa_def *tex_z =
3031       nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
3032 
3033    nir_ssa_def *vertex_id = nir_load_vertex_id(b);
3034 
3035    /* vertex 0: src0_x, src0_y
3036     * vertex 1: src0_x, src1_y
3037     * vertex 2: src1_x, src0_y
3038     * vertex 3: src1_x, src1_y
3039     *
3040     * So:
3041     *
3042     * channel 0 is vertex_id < 2 ? src0_x : src1_x
3043     * channel 1 is vertex id & 1 ? src1_y : src0_y
3044     */
3045 
3046    nir_ssa_def *one = nir_imm_int(b, 1);
3047    nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
3048    nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3049 
3050    nir_ssa_def *comp[4];
3051    comp[0] = nir_bcsel(b, c0cmp,
3052                        nir_channel(b, tex_box, 0),
3053                        nir_channel(b, tex_box, 2));
3054 
3055    comp[1] = nir_bcsel(b, c1cmp,
3056                        nir_channel(b, tex_box, 3),
3057                        nir_channel(b, tex_box, 1));
3058    comp[2] = tex_z;
3059    comp[3] = nir_imm_float(b, 1.0f);
3060    return nir_vec(b, comp, 4);
3061 }
3062 
3063 static nir_ssa_def *
3064 build_nir_tex_op_read(struct nir_builder *b,
3065                       nir_ssa_def *tex_pos,
3066                       enum glsl_base_type tex_type,
3067                       enum glsl_sampler_dim dim)
3068 {
3069    assert(dim != GLSL_SAMPLER_DIM_MS);
3070 
3071    const struct glsl_type *sampler_type =
3072       glsl_sampler_type(dim, false, false, tex_type);
3073    nir_variable *sampler =
3074       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3075    sampler->data.descriptor_set = 0;
3076    sampler->data.binding = 0;
3077 
3078    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3079    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3080    tex->sampler_dim = dim;
3081    tex->op = nir_texop_tex;
3082    tex->src[0].src_type = nir_tex_src_coord;
3083    tex->src[0].src = nir_src_for_ssa(tex_pos);
3084    tex->src[1].src_type = nir_tex_src_texture_deref;
3085    tex->src[1].src = nir_src_for_ssa(tex_deref);
3086    tex->src[2].src_type = nir_tex_src_sampler_deref;
3087    tex->src[2].src = nir_src_for_ssa(tex_deref);
3088    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3089    tex->is_array = glsl_sampler_type_is_array(sampler_type);
3090    tex->coord_components = tex_pos->num_components;
3091 
3092    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3093    nir_builder_instr_insert(b, &tex->instr);
3094    return &tex->dest.ssa;
3095 }
3096 
3097 static nir_ssa_def *
3098 build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3099                                  nir_variable *sampler,
3100                                  nir_ssa_def *tex_deref,
3101                                  enum glsl_base_type tex_type,
3102                                  nir_ssa_def *tex_pos,
3103                                  nir_ssa_def *sample_idx)
3104 {
3105    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4);
3106    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3107    tex->op = nir_texop_txf_ms;
3108    tex->src[0].src_type = nir_tex_src_coord;
3109    tex->src[0].src = nir_src_for_ssa(tex_pos);
3110    tex->src[1].src_type = nir_tex_src_texture_deref;
3111    tex->src[1].src = nir_src_for_ssa(tex_deref);
3112    tex->src[2].src_type = nir_tex_src_sampler_deref;
3113    tex->src[2].src = nir_src_for_ssa(tex_deref);
3114    tex->src[3].src_type = nir_tex_src_ms_index;
3115    tex->src[3].src = nir_src_for_ssa(sample_idx);
3116    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3117    tex->is_array = false;
3118    tex->coord_components = tex_pos->num_components;
3119 
3120    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3121    nir_builder_instr_insert(b, &tex->instr);
3122    return &tex->dest.ssa;
3123 }
3124 
3125 /* Fetches all samples at the given position and averages them */
3126 static nir_ssa_def *
3127 build_nir_tex_op_ms_resolve(struct nir_builder *b,
3128                             nir_ssa_def *tex_pos,
3129                             enum glsl_base_type tex_type,
3130                             VkSampleCountFlagBits src_samples)
3131 {
3132    assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3133    const struct glsl_type *sampler_type =
3134       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3135    nir_variable *sampler =
3136       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3137    sampler->data.descriptor_set = 0;
3138    sampler->data.binding = 0;
3139 
3140    const bool is_int = glsl_base_type_is_integer(tex_type);
3141 
3142    nir_ssa_def *tmp = NULL;
3143    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3144    for (uint32_t i = 0; i < src_samples; i++) {
3145       nir_ssa_def *s =
3146          build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3147                                           tex_type, tex_pos,
3148                                           nir_imm_int(b, i));
3149 
3150       /* For integer formats, the multisample resolve operation is expected to
3151        * return the value of one of the samples, so we just return the first one.
3152        */
3153       if (is_int)
3154          return s;
3155 
3156       tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3157    }
3158 
3159    assert(!is_int);
3160    return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples));
3161 }
3162 
3163 /* Fetches the current sample (gl_SampleID) at the given position */
3164 static nir_ssa_def *
3165 build_nir_tex_op_ms_read(struct nir_builder *b,
3166                          nir_ssa_def *tex_pos,
3167                          enum glsl_base_type tex_type)
3168 {
3169    const struct glsl_type *sampler_type =
3170       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3171    nir_variable *sampler =
3172       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3173    sampler->data.descriptor_set = 0;
3174    sampler->data.binding = 0;
3175 
3176    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3177 
3178    return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3179                                            tex_type, tex_pos,
3180                                            nir_load_sample_id(b));
3181 }
3182 
3183 static nir_ssa_def *
3184 build_nir_tex_op(struct nir_builder *b,
3185                  struct v3dv_device *device,
3186                  nir_ssa_def *tex_pos,
3187                  enum glsl_base_type tex_type,
3188                  VkSampleCountFlagBits dst_samples,
3189                  VkSampleCountFlagBits src_samples,
3190                  enum glsl_sampler_dim dim)
3191 {
3192    switch (dim) {
3193    case GLSL_SAMPLER_DIM_MS:
3194       assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3195       /* For multisampled texture sources we need to use fetching instead of
3196        * normalized texture coordinates. We already configured our blit
3197        * coordinates to be in texel units, but here we still need to convert
3198        * them from floating point to integer.
3199        */
3200       tex_pos = nir_f2i32(b, tex_pos);
3201 
3202       if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3203          return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3204       else
3205          return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3206    default:
3207       assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3208       return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3209    }
3210 }
3211 
3212 static nir_shader *
3213 get_blit_vs()
3214 {
3215    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3216    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
3217                                                   "meta blit vs");
3218 
3219    const struct glsl_type *vec4 = glsl_vec4_type();
3220 
3221    nir_variable *vs_out_pos =
3222       nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3223    vs_out_pos->data.location = VARYING_SLOT_POS;
3224 
3225    nir_variable *vs_out_tex_coord =
3226       nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3227    vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3228    vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3229 
3230    nir_ssa_def *pos = gen_rect_vertices(&b);
3231    nir_store_var(&b, vs_out_pos, pos, 0xf);
3232 
3233    nir_ssa_def *tex_coord = gen_tex_coords(&b);
3234    nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3235 
3236    return b.shader;
3237 }
3238 
3239 static uint32_t
3240 get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3241 {
3242    switch (sampler_dim) {
3243    case GLSL_SAMPLER_DIM_1D: return 0x1;
3244    case GLSL_SAMPLER_DIM_2D: return 0x3;
3245    case GLSL_SAMPLER_DIM_MS: return 0x3;
3246    case GLSL_SAMPLER_DIM_3D: return 0x7;
3247    default:
3248       unreachable("invalid sampler dim");
3249    };
3250 }
3251 
3252 static nir_shader *
3253 get_color_blit_fs(struct v3dv_device *device,
3254                   VkFormat dst_format,
3255                   VkFormat src_format,
3256                   VkSampleCountFlagBits dst_samples,
3257                   VkSampleCountFlagBits src_samples,
3258                   enum glsl_sampler_dim sampler_dim)
3259 {
3260    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3261    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
3262                                                   "meta blit fs");
3263 
3264    const struct glsl_type *vec4 = glsl_vec4_type();
3265 
3266    nir_variable *fs_in_tex_coord =
3267       nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
3268    fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;
3269 
3270    const struct glsl_type *fs_out_type =
3271       vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
3272       vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
3273                                       glsl_vec4_type();
3274 
3275    enum glsl_base_type src_base_type =
3276       vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
3277       vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
3278                                       GLSL_TYPE_FLOAT;
3279 
3280    nir_variable *fs_out_color =
3281       nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
3282    fs_out_color->data.location = FRAG_RESULT_DATA0;
3283 
3284    nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
3285    const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
3286    tex_coord = nir_channels(&b, tex_coord, channel_mask);
3287 
3288    nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
3289                                          dst_samples, src_samples, sampler_dim);
3290 
3291    /* For integer textures, if the bit-size of the destination is too small
3292     * to hold the source value, Vulkan (CTS) expects the implementation to
3293     * clamp to the maximum value the destination can hold. The hardware can
3294     * clamp to the render target type, which usually matches the component
3295     * bit-size, but some cases won't match, such as rgb10a2, which has a
3296     * 16-bit render target type, so in these cases we need to clamp manually.
3297     */
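        /* For example, with a VK_FORMAT_A2B10G10R10_UINT_PACK32 destination the
         * RGB channels have dst_bit_size = 10, so the code below clamps
         * unsigned values to 1023 and signed ones (the SINT variant) to the
         * range [-512, 511].
         */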
3298    if (format_needs_software_int_clamp(dst_format)) {
3299       assert(vk_format_is_int(dst_format));
3300       enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
3301       enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
3302 
3303       nir_ssa_def *c[4];
3304       for (uint32_t i = 0; i < 4; i++) {
3305          c[i] = nir_channel(&b, color, i);
3306 
3307          const uint32_t src_bit_size =
3308             util_format_get_component_bits(src_pformat,
3309                                            UTIL_FORMAT_COLORSPACE_RGB,
3310                                            i);
3311          const uint32_t dst_bit_size =
3312             util_format_get_component_bits(dst_pformat,
3313                                            UTIL_FORMAT_COLORSPACE_RGB,
3314                                            i);
3315 
3316          if (dst_bit_size >= src_bit_size)
3317             continue;
3318 
3319          assert(dst_bit_size > 0);
3320          if (util_format_is_pure_uint(dst_pformat)) {
3321             nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
3322             c[i] = nir_umin(&b, c[i], max);
3323          } else {
3324             nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
3325             nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
3326             c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
3327          }
3328       }
3329 
3330       color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
3331    }
3332 
3333    nir_store_var(&b, fs_out_color, color, 0xf);
3334 
3335    return b.shader;
3336 }
3337 
3338 static bool
3339 create_pipeline(struct v3dv_device *device,
3340                 struct v3dv_render_pass *pass,
3341                 struct nir_shader *vs_nir,
3342                 struct nir_shader *gs_nir,
3343                 struct nir_shader *fs_nir,
3344                 const VkPipelineVertexInputStateCreateInfo *vi_state,
3345                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
3346                 const VkPipelineColorBlendStateCreateInfo *cb_state,
3347                 const VkPipelineMultisampleStateCreateInfo *ms_state,
3348                 const VkPipelineLayout layout,
3349                 VkPipeline *pipeline)
3350 {
3351    struct vk_shader_module vs_m;
3352    struct vk_shader_module gs_m;
3353    struct vk_shader_module fs_m;
3354 
3355    uint32_t num_stages = gs_nir ? 3 : 2;
3356 
3357    v3dv_shader_module_internal_init(device, &vs_m, vs_nir);
3358    v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
3359 
3360    VkPipelineShaderStageCreateInfo stages[3] = {
3361       {
3362          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3363          .stage = VK_SHADER_STAGE_VERTEX_BIT,
3364          .module = vk_shader_module_to_handle(&vs_m),
3365          .pName = "main",
3366       },
3367       {
3368          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3369          .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
3370          .module = vk_shader_module_to_handle(&fs_m),
3371          .pName = "main",
3372       },
3373       {
3374          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3375          .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
3376          .module = VK_NULL_HANDLE,
3377          .pName = "main",
3378       },
3379    };
3380 
3381    if (gs_nir) {
3382       v3dv_shader_module_internal_init(device, &gs_m, gs_nir);
3383       stages[2].module = vk_shader_module_to_handle(&gs_m);
3384    }
3385 
3386    VkGraphicsPipelineCreateInfo info = {
3387       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
3388 
3389       .stageCount = num_stages,
3390       .pStages = stages,
3391 
3392       .pVertexInputState = vi_state,
3393 
3394       .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
3395          .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
3396          .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
3397          .primitiveRestartEnable = false,
3398       },
3399 
3400       .pViewportState = &(VkPipelineViewportStateCreateInfo) {
3401          .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
3402          .viewportCount = 1,
3403          .scissorCount = 1,
3404       },
3405 
3406       .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
3407          .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
3408          .rasterizerDiscardEnable = false,
3409          .polygonMode = VK_POLYGON_MODE_FILL,
3410          .cullMode = VK_CULL_MODE_NONE,
3411          .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
3412          .depthBiasEnable = false,
3413       },
3414 
3415       .pMultisampleState = ms_state,
3416 
3417       .pDepthStencilState = ds_state,
3418 
3419       .pColorBlendState = cb_state,
3420 
3421       /* These meta pipelines declare all state as dynamic.
3422        * As a consequence, vkCmdBindPipeline writes no dynamic state
3423        * to the cmd buffer. Therefore, at the end of the meta operation,
3424        * we need only restore dynamic state that was set via vkCmdSet*.
3425        */
3426       .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
3427          .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
3428          .dynamicStateCount = 6,
3429          .pDynamicStates = (VkDynamicState[]) {
3430             VK_DYNAMIC_STATE_VIEWPORT,
3431             VK_DYNAMIC_STATE_SCISSOR,
3432             VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
3433             VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
3434             VK_DYNAMIC_STATE_STENCIL_REFERENCE,
3435             VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3436             VK_DYNAMIC_STATE_DEPTH_BIAS,
3437             VK_DYNAMIC_STATE_LINE_WIDTH,
3438          },
3439       },
3440 
3441       .flags = 0,
3442       .layout = layout,
3443       .renderPass = v3dv_render_pass_to_handle(pass),
3444       .subpass = 0,
3445    };
3446 
3447    VkResult result =
3448       v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
3449                                    VK_NULL_HANDLE,
3450                                    1, &info,
3451                                    &device->vk.alloc,
3452                                    pipeline);
3453 
3454    ralloc_free(vs_nir);
3455    ralloc_free(fs_nir);
3456 
3457    return result == VK_SUCCESS;
3458 }
3459 
3460 static enum glsl_sampler_dim
3461 get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
3462 {
3463    /* From the Vulkan 1.0 spec, VkImageCreateInfo Valid Usage:
3464     *
3465     *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
3466     *    VK_IMAGE_TYPE_2D, ..."
3467     */
3468    assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);
3469 
3470    switch (type) {
3471    case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
3472    case VK_IMAGE_TYPE_2D:
3473       return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
3474                                                     GLSL_SAMPLER_DIM_MS;
3475    case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
3476    default:
3477       unreachable("Invalid image type");
3478    }
3479 }
3480 
3481 static bool
3482 create_blit_pipeline(struct v3dv_device *device,
3483                      VkFormat dst_format,
3484                      VkFormat src_format,
3485                      VkColorComponentFlags cmask,
3486                      VkImageType src_type,
3487                      VkSampleCountFlagBits dst_samples,
3488                      VkSampleCountFlagBits src_samples,
3489                      VkRenderPass _pass,
3490                      VkPipelineLayout pipeline_layout,
3491                      VkPipeline *pipeline)
3492 {
3493    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
3494 
3495    /* We always rewrite depth/stencil blits to compatible color blits */
3496    assert(vk_format_is_color(dst_format));
3497    assert(vk_format_is_color(src_format));
3498 
3499    const enum glsl_sampler_dim sampler_dim =
3500       get_sampler_dim(src_type, src_samples);
3501 
3502    nir_shader *vs_nir = get_blit_vs();
3503    nir_shader *fs_nir =
3504       get_color_blit_fs(device, dst_format, src_format,
3505                         dst_samples, src_samples, sampler_dim);
3506 
3507    const VkPipelineVertexInputStateCreateInfo vi_state = {
3508       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
3509       .vertexBindingDescriptionCount = 0,
3510       .vertexAttributeDescriptionCount = 0,
3511    };
3512 
3513    VkPipelineDepthStencilStateCreateInfo ds_state = {
3514       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
3515    };
3516 
3517    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
3518    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
3519       .blendEnable = false,
3520       .colorWriteMask = cmask,
3521    };
3522 
3523    const VkPipelineColorBlendStateCreateInfo cb_state = {
3524       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
3525       .logicOpEnable = false,
3526       .attachmentCount = 1,
3527       .pAttachments = blend_att_state
3528    };
3529 
3530    const VkPipelineMultisampleStateCreateInfo ms_state = {
3531       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
3532       .rasterizationSamples = dst_samples,
3533       .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
3534       .pSampleMask = NULL,
3535       .alphaToCoverageEnable = false,
3536       .alphaToOneEnable = false,
3537    };
3538 
3539    return create_pipeline(device,
3540                           pass,
3541                           vs_nir, NULL, fs_nir,
3542                           &vi_state,
3543                           &ds_state,
3544                           &cb_state,
3545                           &ms_state,
3546                           pipeline_layout,
3547                           pipeline);
3548 }
3549 
3550 /**
3551  * Return a pipeline suitable for blitting the requested aspect given the
3552  * destination and source formats.
3553  */
3554 static bool
3555 get_blit_pipeline(struct v3dv_device *device,
3556                   VkFormat dst_format,
3557                   VkFormat src_format,
3558                   VkColorComponentFlags cmask,
3559                   VkImageType src_type,
3560                   VkSampleCountFlagBits dst_samples,
3561                   VkSampleCountFlagBits src_samples,
3562                   struct v3dv_meta_blit_pipeline **pipeline)
3563 {
3564    bool ok = true;
3565 
3566    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
3567    get_blit_pipeline_cache_key(dst_format, src_format, cmask,
3568                                dst_samples, src_samples, key);
3569    mtx_lock(&device->meta.mtx);
3570    struct hash_entry *entry =
3571       _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
3572    if (entry) {
3573       mtx_unlock(&device->meta.mtx);
3574       *pipeline = entry->data;
3575       return true;
3576    }
3577 
3578    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
3579                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
3580 
3581    if (*pipeline == NULL)
3582       goto fail;
3583 
3584    ok = create_blit_render_pass(device, dst_format, src_format,
3585                                 &(*pipeline)->pass,
3586                                 &(*pipeline)->pass_no_load);
3587    if (!ok)
3588       goto fail;
3589 
3590    /* Create the pipeline using one of the render passes, they are both
3591     * compatible, so we don't care which one we use here.
3592     */
3593    ok = create_blit_pipeline(device,
3594                              dst_format,
3595                              src_format,
3596                              cmask,
3597                              src_type,
3598                              dst_samples,
3599                              src_samples,
3600                              (*pipeline)->pass,
3601                              device->meta.blit.p_layout,
3602                              &(*pipeline)->pipeline);
3603    if (!ok)
3604       goto fail;
3605 
3606    memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
3607    _mesa_hash_table_insert(device->meta.blit.cache[src_type],
3608                            &(*pipeline)->key, *pipeline);
3609 
3610    mtx_unlock(&device->meta.mtx);
3611    return true;
3612 
3613 fail:
3614    mtx_unlock(&device->meta.mtx);
3615 
3616    VkDevice _device = v3dv_device_to_handle(device);
3617    if (*pipeline) {
3618       if ((*pipeline)->pass)
3619          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
3620       if ((*pipeline)->pass_no_load)
3621          v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->vk.alloc);
3622       if ((*pipeline)->pipeline)
3623          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
3624       vk_free(&device->vk.alloc, *pipeline);
3625       *pipeline = NULL;
3626    }
3627 
3628    return false;
3629 }
3630 
3631 static void
3632 compute_blit_box(const VkOffset3D *offsets,
3633                  uint32_t image_w, uint32_t image_h,
3634                  uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
3635                  bool *mirror_x, bool *mirror_y)
3636 {
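        /* The resulting box always stores a top-left origin and a positive
         * extent, with mirroring tracked separately. For example, offsets
         * (4,4)..(0,0) on an 8x8 level yield mirror_x = mirror_y = true,
         * x = y = 0 and w = h = 4.
         */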
3637    if (offsets[1].x >= offsets[0].x) {
3638       *mirror_x = false;
3639       *x = MIN2(offsets[0].x, image_w - 1);
3640       *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
3641    } else {
3642       *mirror_x = true;
3643       *x = MIN2(offsets[1].x, image_w - 1);
3644       *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
3645    }
3646    if (offsets[1].y >= offsets[0].y) {
3647       *mirror_y = false;
3648       *y = MIN2(offsets[0].y, image_h - 1);
3649       *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
3650    } else {
3651       *mirror_y = true;
3652       *y = MIN2(offsets[1].y, image_h - 1);
3653       *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
3654    }
3655 }
3656 
3657 static void
3658 compute_blit_3d_layers(const VkOffset3D *offsets,
3659                        uint32_t *min_layer, uint32_t *max_layer,
3660                        bool *mirror_z)
3661 {
3662    if (offsets[1].z >= offsets[0].z) {
3663       *mirror_z = false;
3664       *min_layer = offsets[0].z;
3665       *max_layer = offsets[1].z;
3666    } else {
3667       *mirror_z = true;
3668       *min_layer = offsets[1].z;
3669       *max_layer = offsets[0].z;
3670    }
3671 }
3672 
3673 static VkResult
3674 create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
3675 {
3676    /* If this is not the first pool we create for this command buffer, size
3677     * it based on the exhausted pool: double its capacity, capped at 1024.
3678     */
3679    uint32_t descriptor_count = 64;
3680    if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
3681       struct v3dv_descriptor_pool *exhausted_pool =
3682          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
3683       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
3684    }
3685 
3686    /* Create the descriptor pool */
3687    cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
3688    VkDescriptorPoolSize pool_size = {
3689       .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
3690       .descriptorCount = descriptor_count,
3691    };
3692    VkDescriptorPoolCreateInfo info = {
3693       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
3694       .maxSets = descriptor_count,
3695       .poolSizeCount = 1,
3696       .pPoolSizes = &pool_size,
3697       .flags = 0,
3698    };
3699    VkResult result =
3700       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
3701                                 &info,
3702                                 &cmd_buffer->device->vk.alloc,
3703                                 &cmd_buffer->meta.blit.dspool);
3704 
3705    if (result == VK_SUCCESS) {
3706       assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
3707       const VkDescriptorPool _pool = cmd_buffer->meta.blit.dspool;
3708 
3709       v3dv_cmd_buffer_add_private_obj(
3710          cmd_buffer, (uintptr_t) _pool,
3711          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
3712 
3713       struct v3dv_descriptor_pool *pool =
3714          v3dv_descriptor_pool_from_handle(_pool);
3715       pool->is_driver_internal = true;
3716    }
3717 
3718    return result;
3719 }
3720 
3721 static VkResult
3722 allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
3723                                     VkDescriptorSet *set)
3724 {
3725    /* Make sure we have a descriptor pool */
3726    VkResult result;
3727    if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
3728       result = create_blit_descriptor_pool(cmd_buffer);
3729       if (result != VK_SUCCESS)
3730          return result;
3731    }
3732    assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
3733 
3734    /* Allocate descriptor set */
3735    struct v3dv_device *device = cmd_buffer->device;
3736    VkDevice _device = v3dv_device_to_handle(device);
3737    VkDescriptorSetAllocateInfo info = {
3738       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
3739       .descriptorPool = cmd_buffer->meta.blit.dspool,
3740       .descriptorSetCount = 1,
3741       .pSetLayouts = &device->meta.blit.ds_layout,
3742    };
3743    result = v3dv_AllocateDescriptorSets(_device, &info, set);
3744 
3745    /* If we ran out of pool space, grow the pool and try again */
3746    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
3747       result = create_blit_descriptor_pool(cmd_buffer);
3748       if (result == VK_SUCCESS) {
3749          info.descriptorPool = cmd_buffer->meta.blit.dspool;
3750          result = v3dv_AllocateDescriptorSets(_device, &info, set);
3751       }
3752    }
3753 
3754    return result;
3755 }
3756 
3757 /**
3758  * Returns true if the implementation supports the requested operation (even if
3759  * it failed to process it, for example, due to an out-of-memory error).
3760  *
3761  * The caller can specify the channels on the destination to be written via the
3762  * cmask parameter (which can be 0 to default to all channels), as well as a
3763  * swizzle to apply to the source via the cswizzle parameter (which can be NULL
3764  * to use the default identity swizzle).
3765  */
3766 static bool
3767 blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
3768             struct v3dv_image *dst,
3769             VkFormat dst_format,
3770             struct v3dv_image *src,
3771             VkFormat src_format,
3772             VkColorComponentFlags cmask,
3773             VkComponentMapping *cswizzle,
3774             const VkImageBlit2KHR *_region,
3775             VkFilter filter,
3776             bool dst_is_padded_image)
3777 {
3778    bool handled = true;
3779    VkResult result;
3780    uint32_t dirty_dynamic_state = 0;
3781 
3782    /* We don't support rendering to linear depth/stencil; this should have
3783     * been rewritten to a compatible color blit by the caller.
3784     */
3785    assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR ||
3786           !vk_format_is_depth_or_stencil(dst_format));
3787 
3788    /* Can't sample from linear images, except 1D */
3789    if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && src->vk.image_type != VK_IMAGE_TYPE_1D)
3790       return false;
3791 
3792    VkImageBlit2KHR region = *_region;
3793    /* Rewrite combined D/S blits to compatible color blits */
3794    if (vk_format_is_depth_or_stencil(dst_format)) {
3795       assert(src_format == dst_format);
3796       assert(cmask == 0);
3797       switch(dst_format) {
3798       case VK_FORMAT_D16_UNORM:
3799          dst_format = VK_FORMAT_R16_UINT;
3800          break;
3801       case VK_FORMAT_D32_SFLOAT:
3802          dst_format = VK_FORMAT_R32_UINT;
3803          break;
3804       case VK_FORMAT_X8_D24_UNORM_PACK32:
3805       case VK_FORMAT_D24_UNORM_S8_UINT:
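              /* In the compatible RGBA8UI view, depth maps to the G/B/A
               * channels and (for D24S8) stencil to the R channel, so build
               * the color write mask from the requested aspects.
               */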
3806          if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3807             cmask |= VK_COLOR_COMPONENT_G_BIT |
3808                      VK_COLOR_COMPONENT_B_BIT |
3809                      VK_COLOR_COMPONENT_A_BIT;
3810          }
3811          if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3812             assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
3813             cmask |= VK_COLOR_COMPONENT_R_BIT;
3814          }
3815          dst_format = VK_FORMAT_R8G8B8A8_UINT;
3816          break;
3817       default:
3818          unreachable("Unsupported depth/stencil format");
3819       };
3820       src_format = dst_format;
3821       region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
3822       region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
3823    }
3824 
3825    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
3826                                             VK_COLOR_COMPONENT_G_BIT |
3827                                             VK_COLOR_COMPONENT_B_BIT |
3828                                             VK_COLOR_COMPONENT_A_BIT;
3829    if (cmask == 0)
3830       cmask = full_cmask;
3831 
3832    VkComponentMapping ident_swizzle = {
3833       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
3834       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
3835       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
3836       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
3837    };
3838    if (!cswizzle)
3839       cswizzle = &ident_swizzle;
3840 
3841    /* When we get here from a copy between compressed / uncompressed images
3842     * we choose to specify the destination blit region based on the size
3843     * semantics of the source image of the copy (see copy_image_blit), so we
3844     * need to apply those same semantics here when we compute the size of the
3845     * destination image level.
3846     */
3847    const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format);
3848    const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format);
3849    const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format);
3850    const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format);
3851    const uint32_t dst_level_w =
3852       u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w),
3853                region.dstSubresource.mipLevel);
3854    const uint32_t dst_level_h =
3855       u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h),
3856                region.dstSubresource.mipLevel);
3857 
3858    const uint32_t src_level_w =
3859       u_minify(src->vk.extent.width, region.srcSubresource.mipLevel);
3860    const uint32_t src_level_h =
3861       u_minify(src->vk.extent.height, region.srcSubresource.mipLevel);
3862    const uint32_t src_level_d =
3863       u_minify(src->vk.extent.depth, region.srcSubresource.mipLevel);
3864 
3865    uint32_t dst_x, dst_y, dst_w, dst_h;
3866    bool dst_mirror_x, dst_mirror_y;
3867    compute_blit_box(region.dstOffsets,
3868                     dst_level_w, dst_level_h,
3869                     &dst_x, &dst_y, &dst_w, &dst_h,
3870                     &dst_mirror_x, &dst_mirror_y);
3871 
3872    uint32_t src_x, src_y, src_w, src_h;
3873    bool src_mirror_x, src_mirror_y;
3874    compute_blit_box(region.srcOffsets,
3875                     src_level_w, src_level_h,
3876                     &src_x, &src_y, &src_w, &src_h,
3877                     &src_mirror_x, &src_mirror_y);
3878 
3879    uint32_t min_dst_layer;
3880    uint32_t max_dst_layer;
3881    bool dst_mirror_z = false;
3882    if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
3883       min_dst_layer = region.dstSubresource.baseArrayLayer;
3884       max_dst_layer = min_dst_layer + region.dstSubresource.layerCount;
3885    } else {
3886       compute_blit_3d_layers(region.dstOffsets,
3887                              &min_dst_layer, &max_dst_layer,
3888                              &dst_mirror_z);
3889    }
3890 
3891    uint32_t min_src_layer;
3892    uint32_t max_src_layer;
3893    bool src_mirror_z = false;
3894    if (src->vk.image_type != VK_IMAGE_TYPE_3D) {
3895       min_src_layer = region.srcSubresource.baseArrayLayer;
3896       max_src_layer = min_src_layer + region.srcSubresource.layerCount;
3897    } else {
3898       compute_blit_3d_layers(region.srcOffsets,
3899                              &min_src_layer, &max_src_layer,
3900                              &src_mirror_z);
3901    }
3902 
3903    uint32_t layer_count = max_dst_layer - min_dst_layer;
3904 
3905    /* Translate source blit coordinates to normalized texture coordinates for
3906     * single sampled textures. For multisampled textures we require
3907     * unnormalized coordinates, since we can only do texelFetch on them.
3908     */
3909    float coords[4] = {
3910       (float)src_x,
3911       (float)src_y,
3912       (float)(src_x + src_w),
3913       (float)(src_y + src_h),
3914    };
3915 
3916    if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) {
3917       coords[0] /= (float)src_level_w;
3918       coords[1] /= (float)src_level_h;
3919       coords[2] /= (float)src_level_w;
3920       coords[3] /= (float)src_level_h;
3921    }
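   /* Example with hypothetical values: for src_x = 8, src_w = 32 and a 64x64
    * single-sampled source level, the normalized x range becomes
    * [8/64, 40/64] = [0.125, 0.625].
    */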
3922 
3923    /* Handle mirroring */
3924    const bool mirror_x = dst_mirror_x != src_mirror_x;
3925    const bool mirror_y = dst_mirror_y != src_mirror_y;
3926    const bool mirror_z = dst_mirror_z != src_mirror_z;
3927    float tex_coords[5] = {
3928       !mirror_x ? coords[0] : coords[2],
3929       !mirror_y ? coords[1] : coords[3],
3930       !mirror_x ? coords[2] : coords[0],
3931       !mirror_y ? coords[3] : coords[1],
3932       /* Z coordinate for 3D blit sources, to be filled for each
3933        * destination layer
3934        */
3935       0.0f
3936    };
3937 
3938    /* For blits from 3D images we also need to compute the slice coordinate to
3939     * sample from, which will change for each layer in the destination.
3940     * Compute the step by which we should advance it for each iteration.
3941     */
3942    const float src_z_step =
3943       (float)(max_src_layer - min_src_layer) / (float)layer_count;
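   /* Illustrative example: blitting 16 source slices onto 4 destination
    * layers gives src_z_step = 4.0, so each destination layer samples from
    * the middle of its own 4-slice span (see the per-layer loop below).
    */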
3944 
3945    /* Get the blit pipeline */
3946    struct v3dv_meta_blit_pipeline *pipeline = NULL;
3947    bool ok = get_blit_pipeline(cmd_buffer->device,
3948                                dst_format, src_format, cmask, src->vk.image_type,
3949                                dst->vk.samples, src->vk.samples,
3950                                &pipeline);
3951    if (!ok)
3952       return handled;
3953    assert(pipeline && pipeline->pipeline &&
3954           pipeline->pass && pipeline->pass_no_load);
3955 
3956    struct v3dv_device *device = cmd_buffer->device;
3957    assert(device->meta.blit.ds_layout);
3958 
3959    VkDevice _device = v3dv_device_to_handle(device);
3960    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
3961 
3962    /* Create sampler for blit source image */
3963    VkSamplerCreateInfo sampler_info = {
3964       .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
3965       .magFilter = filter,
3966       .minFilter = filter,
3967       .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
3968       .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
3969       .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
3970       .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
3971    };
3972    VkSampler sampler;
3973    result = v3dv_CreateSampler(_device, &sampler_info, &device->vk.alloc,
3974                                &sampler);
3975    if (result != VK_SUCCESS)
3976       goto fail;
3977 
3978    v3dv_cmd_buffer_add_private_obj(
3979       cmd_buffer, (uintptr_t)sampler,
3980       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);
3981 
3982    /* Push command buffer state before starting meta operation */
3983    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
3984 
3985    /* Push state that is common for all layers */
3986    v3dv_CmdBindPipeline(_cmd_buffer,
3987                         VK_PIPELINE_BIND_POINT_GRAPHICS,
3988                         pipeline->pipeline);
3989 
3990    const VkViewport viewport = {
3991       .x = dst_x,
3992       .y = dst_y,
3993       .width = dst_w,
3994       .height = dst_h,
3995       .minDepth = 0.0f,
3996       .maxDepth = 1.0f
3997    };
3998    v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
3999 
4000    const VkRect2D scissor = {
4001       .offset = { dst_x, dst_y },
4002       .extent = { dst_w, dst_h }
4003    };
4004    v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
4005 
4006    bool can_skip_tlb_load = false;
4007    const VkRect2D render_area = {
4008       .offset = { dst_x, dst_y },
4009       .extent = { dst_w, dst_h },
4010    };
4011 
4012    /* Record per-layer commands */
4013    VkImageAspectFlags aspects = region.dstSubresource.aspectMask;
4014    for (uint32_t i = 0; i < layer_count; i++) {
4015       /* Setup framebuffer */
4016       VkImageViewCreateInfo dst_image_view_info = {
4017          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4018          .image = v3dv_image_to_handle(dst),
4019          .viewType = v3dv_image_type_to_view_type(dst->vk.image_type),
4020          .format = dst_format,
4021          .subresourceRange = {
4022             .aspectMask = aspects,
4023             .baseMipLevel = region.dstSubresource.mipLevel,
4024             .levelCount = 1,
4025             .baseArrayLayer = min_dst_layer + i,
4026             .layerCount = 1
4027          },
4028       };
4029       VkImageView dst_image_view;
4030       result = v3dv_CreateImageView(_device, &dst_image_view_info,
4031                                     &device->vk.alloc, &dst_image_view);
4032       if (result != VK_SUCCESS)
4033          goto fail;
4034 
4035       v3dv_cmd_buffer_add_private_obj(
4036          cmd_buffer, (uintptr_t)dst_image_view,
4037          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4038 
4039       VkFramebufferCreateInfo fb_info = {
4040          .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
4041          .renderPass = pipeline->pass,
4042          .attachmentCount = 1,
4043          .pAttachments = &dst_image_view,
4044          .width = dst_x + dst_w,
4045          .height = dst_y + dst_h,
4046          .layers = 1,
4047       };
4048 
4049       VkFramebuffer fb;
4050       result = v3dv_CreateFramebuffer(_device, &fb_info,
4051                                       &cmd_buffer->device->vk.alloc, &fb);
4052       if (result != VK_SUCCESS)
4053          goto fail;
4054 
4055       struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
4056       framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
4057                                       fb_info.height == dst_level_h &&
4058                                       dst_is_padded_image;
4059 
4060       v3dv_cmd_buffer_add_private_obj(
4061          cmd_buffer, (uintptr_t)fb,
4062          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
4063 
4064       /* Setup descriptor set for blit source texture. We don't have to
4065        * register the descriptor as a private command buffer object since
4066        * all descriptors will be freed automatically with the descriptor
4067        * pool.
4068        */
4069       VkDescriptorSet set;
4070       result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
4071       if (result != VK_SUCCESS)
4072          goto fail;
4073 
4074       VkImageViewCreateInfo src_image_view_info = {
4075          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4076          .image = v3dv_image_to_handle(src),
4077          .viewType = v3dv_image_type_to_view_type(src->vk.image_type),
4078          .format = src_format,
4079          .components = *cswizzle,
4080          .subresourceRange = {
4081             .aspectMask = aspects,
4082             .baseMipLevel = region.srcSubresource.mipLevel,
4083             .levelCount = 1,
4084             .baseArrayLayer =
4085                src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
4086             .layerCount = 1
4087          },
4088       };
4089       VkImageView src_image_view;
4090       result = v3dv_CreateImageView(_device, &src_image_view_info,
4091                                     &device->vk.alloc, &src_image_view);
4092       if (result != VK_SUCCESS)
4093          goto fail;
4094 
4095       v3dv_cmd_buffer_add_private_obj(
4096          cmd_buffer, (uintptr_t)src_image_view,
4097          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4098 
4099       VkDescriptorImageInfo image_info = {
4100          .sampler = sampler,
4101          .imageView = src_image_view,
4102          .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
4103       };
4104       VkWriteDescriptorSet write = {
4105          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
4106          .dstSet = set,
4107          .dstBinding = 0,
4108          .dstArrayElement = 0,
4109          .descriptorCount = 1,
4110          .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4111          .pImageInfo = &image_info,
4112       };
4113       v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
4114 
4115       v3dv_CmdBindDescriptorSets(_cmd_buffer,
4116                                  VK_PIPELINE_BIND_POINT_GRAPHICS,
4117                                  device->meta.blit.p_layout,
4118                                  0, 1, &set,
4119                                  0, NULL);
4120 
4121       /* If the region we are about to blit is tile-aligned, then we can
4122        * use the render pass version that won't pre-load the tile buffer
4123        * with the dst image contents before the blit. The exception is when we
4124        * don't have a full color mask, since in that case we need to preserve
4125        * the original value of some of the color components.
4126        *
4127        * Since all layers have the same area, we only need to compute this for
4128        * the first.
4129        */
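      /* For example, if the blit only writes the R and G components, we must
       * use the pass that pre-loads the tile buffer so that the destination's
       * existing B and A values are preserved on store.
       */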
4130       if (i == 0) {
4131          struct v3dv_render_pass *pipeline_pass =
4132             v3dv_render_pass_from_handle(pipeline->pass);
4133          can_skip_tlb_load =
4134             cmask == full_cmask &&
4135             v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
4136                                               framebuffer, pipeline_pass, 0);
4137       }
4138 
4139       /* Record blit */
4140       VkRenderPassBeginInfo rp_info = {
4141          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
4142          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
4143                                            pipeline->pass,
4144          .framebuffer = fb,
4145          .renderArea = render_area,
4146          .clearValueCount = 0,
4147       };
4148 
4149       v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
4150       struct v3dv_job *job = cmd_buffer->state.job;
4151       if (!job)
4152          goto fail;
4153 
4154       /* For 3D blits we need to compute the source slice to blit from (the Z
4155        * coordinate of the source sample operation). We want to choose this
4156        * based on the ratio of the depth of the source and the destination
4157        * images, picking the coordinate in the middle of each step.
4158        */
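      /* Continuing the illustrative 16-slice to 4-layer example above:
       * without mirroring, layer i = 1 samples at
       * z = (0 + 1.5 * 4.0) / 16 = 0.375, the middle of its 4-slice span.
       */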
4159       if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
4160          tex_coords[4] =
4161             !mirror_z ?
4162             (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
4163             (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
4164       }
4165 
4166       v3dv_CmdPushConstants(_cmd_buffer,
4167                             device->meta.blit.p_layout,
4168                             VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
4169                             &tex_coords);
4170 
4171       v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
4172 
4173       v3dv_CmdEndRenderPass(_cmd_buffer);
4174       dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
4175    }
4176 
4177 fail:
4178    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
4179 
4180    return handled;
4181 }
4182 
4183 VKAPI_ATTR void VKAPI_CALL
4184 v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
4185                       const VkBlitImageInfo2KHR *pBlitImageInfo)
4186 {
4187    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4188    V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
4189    V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);
4190 
4191    /* This command can only happen outside a render pass */
4192    assert(cmd_buffer->state.pass == NULL);
4193    assert(cmd_buffer->state.job == NULL);
4194 
4195    /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
4196    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
4197           src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4198 
4199    /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
4200    assert(!vk_format_is_compressed(dst->vk.format));
4201 
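   /* For each region, try the TFU path first and fall back to the
    * shader-based blit for anything the TFU path cannot handle.
    */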
4202    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
4203       if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i]))
4204          continue;
4205       if (blit_shader(cmd_buffer,
4206                       dst, dst->vk.format,
4207                       src, src->vk.format,
4208                       0, NULL,
4209                       &pBlitImageInfo->pRegions[i],
4210                       pBlitImageInfo->filter, true)) {
4211          continue;
4212       }
4213       unreachable("Unsupported blit operation");
4214    }
4215 }
4216 
4217 static bool
4218 resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
4219                   struct v3dv_image *dst,
4220                   struct v3dv_image *src,
4221                   const VkImageResolve2KHR *region)
4222 {
4223    if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, NULL) ||
4224        !v3dv_meta_can_use_tlb(dst, &region->dstOffset, NULL)) {
4225       return false;
4226    }
4227 
4228    if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format))
4229       return false;
4230 
4231    const VkFormat fb_format = src->vk.format;
4232 
4233    uint32_t num_layers;
4234    if (dst->vk.image_type != VK_IMAGE_TYPE_3D)
4235       num_layers = region->dstSubresource.layerCount;
4236    else
4237       num_layers = region->extent.depth;
4238    assert(num_layers > 0);
4239 
4240    struct v3dv_job *job =
4241       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
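   /* If job creation failed we still report the resolve as handled: the
    * failure (likely OOM) is not something the blit fallback could recover
    * from either.
    */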
4242    if (!job)
4243       return true;
4244 
4245    const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format);
4246    const uint32_t block_h = vk_format_get_blockheight(dst->vk.format);
4247    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
4248    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
4249 
4250    uint32_t internal_type, internal_bpp;
4251    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
4252       (fb_format, region->srcSubresource.aspectMask,
4253        &internal_type, &internal_bpp);
4254 
4255    v3dv_job_start_frame(job, width, height, num_layers, false,
4256                         1, internal_bpp, true);
4257 
4258    struct v3dv_meta_framebuffer framebuffer;
4259    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
4260                                               internal_type, &job->frame_tiling);
4261 
4262    v3dv_X(job->device, job_emit_binning_flush)(job);
4263    v3dv_X(job->device, meta_emit_resolve_image_rcl)(job, dst, src,
4264                                                     &framebuffer, region);
4265 
4266    v3dv_cmd_buffer_finish_job(cmd_buffer);
4267    return true;
4268 }
4269 
4270 static bool
4271 resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
4272                    struct v3dv_image *dst,
4273                    struct v3dv_image *src,
4274                    const VkImageResolve2KHR *region)
4275 {
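   /* Express the resolve as a 1:1 blit: source and destination regions share
    * the same extent, so no scaling is involved and NEAREST filtering is
    * sufficient.
    */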
4276    const VkImageBlit2KHR blit_region = {
4277       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
4278       .srcSubresource = region->srcSubresource,
4279       .srcOffsets = {
4280          region->srcOffset,
4281          {
4282             region->srcOffset.x + region->extent.width,
4283             region->srcOffset.y + region->extent.height,
4284          }
4285       },
4286       .dstSubresource = region->dstSubresource,
4287       .dstOffsets = {
4288          region->dstOffset,
4289          {
4290             region->dstOffset.x + region->extent.width,
4291             region->dstOffset.y + region->extent.height,
4292          }
4293       },
4294    };
4295    return blit_shader(cmd_buffer,
4296                       dst, dst->vk.format,
4297                       src, src->vk.format,
4298                       0, NULL,
4299                       &blit_region, VK_FILTER_NEAREST, true);
4300 }
4301 
4302 VKAPI_ATTR void VKAPI_CALL
4303 v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
4304                          const VkResolveImageInfo2KHR *info)
4306 {
4307    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4308    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
4309    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
4310 
4311    /* This command can only happen outside a render pass */
4312    assert(cmd_buffer->state.pass == NULL);
4313    assert(cmd_buffer->state.job == NULL);
4314 
4315    assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT);
4316    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4317 
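   /* For each region, try the TLB resolve path first and fall back to a
    * shader-based blit resolve otherwise.
    */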
4318    for (uint32_t i = 0; i < info->regionCount; i++) {
4319       if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
4320          continue;
4321       if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
4322          continue;
4323       unreachable("Unsupported multismaple resolve operation");
4324    }
4325 }
4326