1 /*
2  * Copyright © 2021 Raspberry Pi
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 #include "v3dv_meta_common.h"
26 
27 #include "broadcom/common/v3d_macros.h"
28 #include "broadcom/cle/v3dx_pack.h"
29 #include "broadcom/compiler/v3d_compiler.h"
30 
31 #include "vk_format_info.h"
32 
/* Parameters describing an optional clear to program in the RCL prologue. */
struct rcl_clear_info {
   const union v3dv_clear_value *clear_value; /* Color / depth / stencil clear values */
   struct v3dv_image *image;   /* Optional image being cleared; used to compute UIF clear padding */
   VkImageAspectFlags aspects; /* Aspects to clear */
   uint32_t level;             /* Mip level of 'image' being cleared */
};
39 
/* Emits the common prologue of a render control list (RCL) for meta
 * operations: tile rendering mode configuration, optional clear colors,
 * render target 0 setup, Z/S clear values and the initial tile list block
 * size.
 *
 * If 'clear_info' is non-NULL and includes the color aspect, the clear
 * color packets are emitted; Z/S clear values are always programmed,
 * falling back to 1.0 / 0 when no clear is requested.
 *
 * Returns the job's RCL on success, or NULL if the command buffer ran out
 * of memory while growing the CL.
 */
static struct v3dv_cl *
emit_rcl_prologue(struct v3dv_job *job,
                  struct v3dv_meta_framebuffer *fb,
                  const struct rcl_clear_info *clear_info)
{
   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;
   /* Reserve space up front: a fixed budget for the config packets plus
    * room for per-layer supertile coordinates (256 per layer presumably
    * being a worst-case supertile count -- TODO confirm).
    */
   v3dv_cl_ensure_space_with_branch(rcl, 200 +
                                    tiling->layers * 256 *
                                    cl_packet_length(SUPERTILE_COORDINATES));
   if (job->cmd_buffer->state.oom)
      return NULL;

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
      config.early_z_disable = true;
      config.image_width_pixels = tiling->width;
      config.image_height_pixels = tiling->height;
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = tiling->msaa;
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
      config.internal_depth_type = fb->internal_depth_type;
   }

   if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
      /* For UIF-tiled images whose padded height exceeds the implicit
       * padding by 15 or more UIF blocks, program the explicit padded
       * height (via the PART3 packet below) so the padding area is
       * cleared as well.
       */
      uint32_t clear_pad = 0;
      if (clear_info->image) {
         const struct v3dv_image *image = clear_info->image;
         const struct v3d_resource_slice *slice =
            &image->slices[clear_info->level];
         if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
             slice->tiling == V3D_TILING_UIF_XOR) {
            int uif_block_height = v3d_utile_height(image->cpp) * 2;

            uint32_t implicit_padded_height =
               align(tiling->height, uif_block_height) / uif_block_height;

            if (slice->padded_height_of_output_image_in_uif_blocks -
                implicit_padded_height >= 15) {
               clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
            }
         }
      }

      /* The 128-bit clear color is split across up to three packets:
       * PART1 carries the low 56 bits, PART2 the middle bits (only for
       * 64bpp+ internal formats) and PART3 the top 16 bits (only for
       * 128bpp internal formats, or whenever a clear pad is needed).
       */
      const uint32_t *color = &clear_info->clear_value->color[0];
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
         clear.clear_color_low_32_bits = color[0];
         clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
         clear.render_target_number = 0;
      };

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
            clear.clear_color_mid_low_32_bits =
              ((color[1] >> 24) | (color[2] << 8));
            clear.clear_color_mid_high_24_bits =
              ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
            clear.render_target_number = 0;
         };
      }

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
            clear.uif_padded_height_in_uif_blocks = clear_pad;
            clear.clear_color_high_16_bits = color[3] >> 16;
            clear.render_target_number = 0;
         };
      }
   }

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      rt.render_target_0_internal_bpp = tiling->internal_bpp;
      rt.render_target_0_internal_type = fb->internal_type;
      rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
   }

   /* Z/S clear values must always be programmed; use defaults when no
    * clear was requested.
    */
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
      clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
      clear.stencil_clear_value = clear_info ? clear_info->clear_value->s : 0;
   };

   cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
      init.use_auto_chained_tile_lists = true;
      init.size_of_first_block_in_chained_tile_lists =
         TILE_ALLOCATION_BLOCK_SIZE_64B;
   }

   return rcl;
}
129 
/* Emits per-frame RCL setup: the multicore tile list base address,
 * the supertile configuration and the GFXH-1742 dummy-tile workaround.
 *
 * 'min_layer' offsets the tile allocation base address so rendering
 * starts at the right layer's tile lists (64 bytes per tile entry,
 * presumably matching TILE_ALLOCATION_BLOCK_SIZE_64B -- TODO confirm).
 *
 * If 'clear_value' is non-NULL, a CLEAR_TILE_BUFFERS packet is emitted
 * during the first dummy tile so the whole tile buffer is cleared before
 * any real tiles render.
 */
static void
emit_frame_setup(struct v3dv_job *job,
                 uint32_t min_layer,
                 const union v3dv_clear_value *clear_value)
{
   v3dv_return_if_oom(NULL, job);

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;

   const uint32_t tile_alloc_offset =
      64 * min_layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
   }

   cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
      config.number_of_bin_tile_lists = 1;
      config.total_frame_width_in_tiles = tiling->draw_tiles_x;
      config.total_frame_height_in_tiles = tiling->draw_tiles_y;

      config.supertile_width_in_tiles = tiling->supertile_width;
      config.supertile_height_in_tiles = tiling->supertile_height;

      config.total_frame_width_in_supertiles =
         tiling->frame_width_in_supertiles;
      config.total_frame_height_in_supertiles =
         tiling->frame_height_in_supertiles;
   }

   /* Implement GFXH-1742 workaround. Also, if we are clearing we have to do
    * it here.
    */
   for (int i = 0; i < 2; i++) {
      cl_emit(rcl, TILE_COORDINATES, coords);
      cl_emit(rcl, END_OF_LOADS, end);
      cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
         store.buffer_to_store = NONE;
      }
      /* Clear only once, on the first dummy tile. */
      if (clear_value && i == 0) {
         cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
            clear.clear_z_stencil_buffer = true;
            clear.clear_all_render_targets = true;
         }
      }
      cl_emit(rcl, END_OF_TILE_MARKER, end);
   }

   cl_emit(rcl, FLUSH_VCD_CACHE, flush);
}
181 
182 static void
emit_supertile_coordinates(struct v3dv_job * job,struct v3dv_meta_framebuffer * framebuffer)183 emit_supertile_coordinates(struct v3dv_job *job,
184                            struct v3dv_meta_framebuffer *framebuffer)
185 {
186    v3dv_return_if_oom(NULL, job);
187 
188    struct v3dv_cl *rcl = &job->rcl;
189 
190    const uint32_t min_y = framebuffer->min_y_supertile;
191    const uint32_t max_y = framebuffer->max_y_supertile;
192    const uint32_t min_x = framebuffer->min_x_supertile;
193    const uint32_t max_x = framebuffer->max_x_supertile;
194 
195    for (int y = min_y; y <= max_y; y++) {
196       for (int x = min_x; x <= max_x; x++) {
197          cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
198             coords.column_number_in_supertiles = x;
199             coords.row_number_in_supertiles = y;
200          }
201       }
202    }
203 }
204 
/* Emits a LOAD_TILE_BUFFER_GENERAL packet that reads raster-order (linear)
 * data from 'bo' at 'offset' into the given tile buffer.
 *
 * buffer: tile buffer destination (e.g. RENDER_TARGET_0).
 * stride: row stride in bytes (raster memory format interprets
 *         height_in_ub_or_stride as a stride).
 * format: a V3D_OUTPUT_IMAGE_FORMAT_* value describing the data layout.
 */
static void
emit_linear_load(struct v3dv_cl *cl,
                 uint32_t buffer,
                 struct v3dv_bo *bo,
                 uint32_t offset,
                 uint32_t stride,
                 uint32_t format)
{
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = buffer;
      load.address = v3dv_cl_address(bo, offset);
      load.input_image_format = format;
      load.memory_format = V3D_TILING_RASTER;
      load.height_in_ub_or_stride = stride;
      /* Linear loads always read sample 0. */
      load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
222 
223 static void
emit_linear_store(struct v3dv_cl * cl,uint32_t buffer,struct v3dv_bo * bo,uint32_t offset,uint32_t stride,bool msaa,uint32_t format)224 emit_linear_store(struct v3dv_cl *cl,
225                   uint32_t buffer,
226                   struct v3dv_bo *bo,
227                   uint32_t offset,
228                   uint32_t stride,
229                   bool msaa,
230                   uint32_t format)
231 {
232    cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
233       store.buffer_to_store = RENDER_TARGET_0;
234       store.address = v3dv_cl_address(bo, offset);
235       store.clear_buffer_being_stored = false;
236       store.output_image_format = format;
237       store.memory_format = V3D_TILING_RASTER;
238       store.height_in_ub_or_stride = stride;
239       store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES :
240                                    V3D_DECIMATE_MODE_SAMPLE_0;
241    }
242 }
243 
244 /* This chooses a tile buffer format that is appropriate for the copy operation.
245  * Typically, this is the image render target type, however, if we are copying
246  * depth/stencil to/from a buffer the hardware can't do raster loads/stores, so
247  * we need to load and store to/from a tile color buffer using a compatible
248  * color format.
249  */
250 static uint32_t
choose_tlb_format(struct v3dv_meta_framebuffer * framebuffer,VkImageAspectFlags aspect,bool for_store,bool is_copy_to_buffer,bool is_copy_from_buffer)251 choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer,
252                   VkImageAspectFlags aspect,
253                   bool for_store,
254                   bool is_copy_to_buffer,
255                   bool is_copy_from_buffer)
256 {
257    if (is_copy_to_buffer || is_copy_from_buffer) {
258       switch (framebuffer->vk_format) {
259       case VK_FORMAT_D16_UNORM:
260          return V3D_OUTPUT_IMAGE_FORMAT_R16UI;
261       case VK_FORMAT_D32_SFLOAT:
262          return V3D_OUTPUT_IMAGE_FORMAT_R32F;
263       case VK_FORMAT_X8_D24_UNORM_PACK32:
264          return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
265       case VK_FORMAT_D24_UNORM_S8_UINT:
266          /* When storing the stencil aspect of a combined depth/stencil image
267           * to a buffer, the Vulkan spec states that the output buffer must
268           * have packed stencil values, so we choose an R8UI format for our
269           * store outputs. For the load input we still want RGBA8UI since the
270           * source image contains 4 channels (including the 3 channels
271           * containing the 24-bit depth value).
272           *
273           * When loading the stencil aspect of a combined depth/stencil image
274           * from a buffer, we read packed 8-bit stencil values from the buffer
275           * that we need to put into the LSB of the 32-bit format (the R
276           * channel), so we use R8UI. For the store, if we used R8UI then we
277           * would write 8-bit stencil values consecutively over depth channels,
278           * so we need to use RGBA8UI. This will write each stencil value in
279           * its correct position, but will overwrite depth values (channels G
280           * B,A) with undefined values. To fix this,  we will have to restore
281           * the depth aspect from the Z tile buffer, which we should pre-load
282           * from the image before the store).
283           */
284          if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) {
285             return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
286          } else {
287             assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT);
288             if (is_copy_to_buffer) {
289                return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI :
290                                   V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
291             } else {
292                assert(is_copy_from_buffer);
293                return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI :
294                                   V3D_OUTPUT_IMAGE_FORMAT_R8UI;
295             }
296          }
297       default: /* Color formats */
298          return framebuffer->format->rt_type;
299          break;
300       }
301    } else {
302       return framebuffer->format->rt_type;
303    }
304 }
305 
306 static inline bool
format_needs_rb_swap(struct v3dv_device * device,VkFormat format)307 format_needs_rb_swap(struct v3dv_device *device,
308                      VkFormat format)
309 {
310    const uint8_t *swizzle = v3dv_get_format_swizzle(device, format);
311    return swizzle[0] == PIPE_SWIZZLE_Z;
312 }
313 
/* Emits a tile buffer load from 'image' (at the given layer and mip level)
 * for the requested aspect, selecting tile buffer, format, memory layout
 * and channel-swizzle workarounds appropriate for the copy operation.
 */
static void
emit_image_load(struct v3dv_device *device,
                struct v3dv_cl *cl,
                struct v3dv_meta_framebuffer *framebuffer,
                struct v3dv_image *image,
                VkImageAspectFlags aspect,
                uint32_t layer,
                uint32_t mip_level,
                bool is_copy_to_buffer,
                bool is_copy_from_buffer)
{
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);

   /* For image to/from buffer copies we always load to and store from RT0,
    * even for depth/stencil aspects, because the hardware can't do raster
    * stores or loads from/to the depth/stencil tile buffers.
    */
   bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                            aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->slices[mip_level];
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = load_to_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      load.address = v3dv_cl_address(image->mem->bo, layer_offset);

      load.input_image_format = choose_tlb_format(framebuffer, aspect, false,
                                                  is_copy_to_buffer,
                                                  is_copy_from_buffer);
      load.memory_format = slice->tiling;

      /* When copying depth/stencil images to a buffer, for D24 formats Vulkan
       * expects the depth value in the LSB bits of each 32-bit pixel.
       * Unfortunately, the hardware seems to put the S8/X8 bits there and the
       * depth bits on the MSB. To work around that we can reverse the channel
       * order and then swap the R/B channels to get what we want.
       *
       * NOTE: reversing and swapping only gets us the behavior we want if the
       * operations happen in that exact order, which seems to be the case when
       * done on the tile buffer load operations. On the store, it seems the
       * order is not the same. The order on the store is probably reversed so
       * that reversing and swapping on both the load and the store preserves
       * the original order of the channels in memory.
       *
       * Notice that we only need to do this when copying to a buffer, where
       * depth and stencil aspects are copied as separate regions and
       * the spec expects them to be tightly packed.
       */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_to_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         /* This is not a raw data copy (i.e. we are clearing the image),
          * so we need to make sure we respect the format swizzle.
          */
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
      }

      load.r_b_swap = needs_rb_swap;
      load.channel_reverse = needs_chan_reverse;

      /* UIF tilings need the padded height in UIF blocks; raster needs the
       * stride; other tilings leave the field at its default.
       */
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         load.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         load.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else
         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
396 
/* Emits a tile buffer store to 'image' (at the given layer and mip level)
 * for the requested aspect. Mirrors emit_image_load() for tile buffer,
 * format and channel-swizzle selection; 'is_multisample_resolve' selects
 * 4x decimation to average samples when storing a resolve destination.
 */
static void
emit_image_store(struct v3dv_device *device,
                 struct v3dv_cl *cl,
                 struct v3dv_meta_framebuffer *framebuffer,
                 struct v3dv_image *image,
                 VkImageAspectFlags aspect,
                 uint32_t layer,
                 uint32_t mip_level,
                 bool is_copy_to_buffer,
                 bool is_copy_from_buffer,
                 bool is_multisample_resolve)
{
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);

   /* Buffer copies always go through RT0; see emit_image_load(). */
   bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                               aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->slices[mip_level];
   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = store_from_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      store.address = v3dv_cl_address(image->mem->bo, layer_offset);
      store.clear_buffer_being_stored = false;

      /* See rationale in emit_image_load() */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_from_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
      }

      store.r_b_swap = needs_rb_swap;
      store.channel_reverse = needs_chan_reverse;

      store.output_image_format = choose_tlb_format(framebuffer, aspect, true,
                                                    is_copy_to_buffer,
                                                    is_copy_from_buffer);
      store.memory_format = slice->tiling;
      /* UIF tilings need the padded height in UIF blocks; raster needs the
       * stride; other tilings leave the field at its default.
       */
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         store.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         store.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else if (is_multisample_resolve)
         store.decimate_mode = V3D_DECIMATE_MODE_4X;
      else
         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
459 
/* Emits the generic (per-tile) tile list for copying one layer of 'image'
 * into 'buffer': load the image layer into the TLB, then store the TLB
 * contents linearly into the buffer at the region's offset. The resulting
 * sub-list is referenced from the job's RCL via
 * START_ADDRESS_OF_GENERIC_TILE_LIST.
 *
 * layer_offset is relative to the region: an array layer offset from
 * baseArrayLayer for non-3D images, or a depth slice offset from
 * imageOffset.z for 3D images.
 */
static void
emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_buffer *buffer,
                                        struct v3dv_image *image,
                                        uint32_t layer_offset,
                                        const VkBufferImageCopy2KHR *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* Load image to TLB */
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->imageSubresource.layerCount) ||
          layer_offset < image->vk.extent.depth);

   const uint32_t image_layer = image->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->imageSubresource.baseArrayLayer + layer_offset :
      region->imageOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, image,
                   region->imageSubresource.aspectMask,
                   image_layer,
                   region->imageSubresource.mipLevel,
                   true, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to buffer */
   /* A zero bufferRowLength/bufferImageHeight means "tightly packed", i.e.
    * use the image extent (Vulkan spec).
    */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy from compressed format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* If we are storing stencil from a combined depth/stencil format the
    * Vulkan spec states that the output buffer must have packed stencil
    * values, where each stencil value is 1 byte.
    */
   uint32_t cpp =
      region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
         1 : image->cpp;
   uint32_t buffer_stride = width * cpp;
   /* Each layer is stored 'height * stride' bytes further into the buffer. */
   uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset +
                            height * buffer_stride * layer_offset;

   uint32_t format = choose_tlb_format(framebuffer,
                                       region->imageSubresource.aspectMask,
                                       true, true, false);
   bool msaa = image->vk.samples > VK_SAMPLE_COUNT_1_BIT;

   emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo,
                     buffer_offset, buffer_stride, msaa, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
539 
/* Emits the RCL commands for copying one layer of 'image' to 'buffer':
 * first the per-tile generic list, then the supertile coordinates that
 * reference it. Order matters: the tile list must exist before the
 * supertile coordinates are emitted.
 */
static void
emit_copy_layer_to_buffer(struct v3dv_job *job,
                          struct v3dv_buffer *buffer,
                          struct v3dv_image *image,
                          struct v3dv_meta_framebuffer *framebuffer,
                          uint32_t layer,
                          const VkBufferImageCopy2KHR *region)
{
   emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer,
                                           image, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}
552 
/* Builds the full RCL for an image-to-buffer copy job: prologue, frame
 * setup, one per-layer copy sequence for every layer in the job's frame
 * tiling, and the end-of-rendering marker.
 */
void
v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job,
                                         struct v3dv_buffer *buffer,
                                         struct v3dv_image *image,
                                         struct v3dv_meta_framebuffer *framebuffer,
                                         const VkBufferImageCopy2KHR *region)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   /* emit_rcl_prologue() returns NULL on OOM; this macro bails in that case. */
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, NULL);
   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
      emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region);
   cl_emit(rcl, END_OF_RENDERING, end);
}
568 
/* Emits the generic (per-tile) tile list for resolving one layer of the
 * multisampled 'src' image into 'dst': load the source layer into the TLB,
 * then store it to the destination with multisample-resolve decimation.
 * The sub-list is referenced from the job's RCL.
 *
 * layer_offset is relative to the region's subresources (array layer for
 * non-3D images, depth slice for 3D images).
 */
static void
emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
                                       struct v3dv_meta_framebuffer *framebuffer,
                                       struct v3dv_image *dst,
                                       struct v3dv_image *src,
                                       uint32_t layer_offset,
                                       const VkImageResolve2KHR *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->vk.extent.depth);

   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->vk.extent.depth);

   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   /* The final 'true' requests multisample-resolve decimation on the store. */
   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, true);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
626 
/* Emits the RCL commands for resolving one layer: the per-tile generic
 * list followed by the supertile coordinates that reference it.
 */
static void
emit_resolve_image_layer(struct v3dv_job *job,
                         struct v3dv_image *dst,
                         struct v3dv_image *src,
                         struct v3dv_meta_framebuffer *framebuffer,
                         uint32_t layer,
                         const VkImageResolve2KHR *region)
{
   emit_resolve_image_layer_per_tile_list(job, framebuffer,
                                          dst, src, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}
639 
/* Builds the full RCL for a multisample resolve job: prologue, frame
 * setup, one per-layer resolve sequence for every layer in the job's
 * frame tiling, and the end-of-rendering marker.
 */
void
v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job,
                                  struct v3dv_image *dst,
                                  struct v3dv_image *src,
                                  struct v3dv_meta_framebuffer *framebuffer,
                                  const VkImageResolve2KHR *region)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   /* emit_rcl_prologue() returns NULL on OOM; this macro bails in that case. */
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, NULL);
   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
      emit_resolve_image_layer(job, dst, src, framebuffer, layer, region);
   cl_emit(rcl, END_OF_RENDERING, end);
}
655 
/* Emits the generic (per-tile) tile list for a buffer-to-buffer copy:
 * load linear data from 'src' into RT0, then store it linearly to 'dst',
 * both using the same stride and tile buffer format. The sub-list is
 * referenced from the job's RCL.
 */
static void
emit_copy_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *dst,
                               struct v3dv_bo *src,
                               uint32_t dst_offset,
                               uint32_t src_offset,
                               uint32_t stride,
                               uint32_t format)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Buffer copies are never multisampled. */
   emit_linear_store(cl, RENDER_TARGET_0,
                     dst, dst_offset, stride, false, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
691 
/* Emits the RCL commands for one buffer-to-buffer copy pass: computes the
 * row stride from the frame tiling width and the per-item size, then emits
 * the per-tile list and the supertile coordinates that reference it.
 * Does not emit the RCL prologue or END_OF_RENDERING (see
 * meta_emit_copy_buffer_rcl for the standalone variant).
 */
void
v3dX(meta_emit_copy_buffer)(struct v3dv_job *job,
                            struct v3dv_bo *dst,
                            struct v3dv_bo *src,
                            uint32_t dst_offset,
                            uint32_t src_offset,
                            struct v3dv_meta_framebuffer *framebuffer,
                            uint32_t format,
                            uint32_t item_size)
{
   const uint32_t stride = job->frame_tiling.width * item_size;
   emit_copy_buffer_per_tile_list(job, dst, src,
                                  dst_offset, src_offset,
                                  stride, format);
   emit_supertile_coordinates(job, framebuffer);
}
708 
709 void
v3dX(meta_emit_copy_buffer_rcl)710 v3dX(meta_emit_copy_buffer_rcl)(struct v3dv_job *job,
711                                 struct v3dv_bo *dst,
712                                 struct v3dv_bo *src,
713                                 uint32_t dst_offset,
714                                 uint32_t src_offset,
715                                 struct v3dv_meta_framebuffer *framebuffer,
716                                 uint32_t format,
717                                 uint32_t item_size)
718 {
719    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
720    v3dv_return_if_oom(NULL, job);
721 
722    emit_frame_setup(job, 0, NULL);
723 
724    v3dX(meta_emit_copy_buffer)(job, dst, src, dst_offset, src_offset,
725                                framebuffer, format, item_size);
726 
727    cl_emit(rcl, END_OF_RENDERING, end);
728 }
729 
static void
emit_copy_image_layer_per_tile_list(struct v3dv_job *job,
                                    struct v3dv_meta_framebuffer *framebuffer,
                                    struct v3dv_image *dst,
                                    struct v3dv_image *src,
                                    uint32_t layer_offset,
                                    const VkImageCopy2KHR *region)
{
   /* Emits a generic tile list that copies one layer (or 3D depth slice)
    * of 'src' into 'dst' through the tile buffer, then references that
    * list from the job's RCL.
    */
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve enough space up front for all the packets emitted below. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->vk.extent.depth);

   /* For 3D images the "layer" is a depth slice selected from srcOffset.z;
    * otherwise it is an array layer relative to baseArrayLayer.
    */
   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->vk.extent.depth);

   /* Same layer selection as for the source, using dstOffset.z for 3D. */
   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Reference the tile list we just wrote from the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
787 
788 static void
emit_copy_image_layer(struct v3dv_job * job,struct v3dv_image * dst,struct v3dv_image * src,struct v3dv_meta_framebuffer * framebuffer,uint32_t layer,const VkImageCopy2KHR * region)789 emit_copy_image_layer(struct v3dv_job *job,
790                       struct v3dv_image *dst,
791                       struct v3dv_image *src,
792                       struct v3dv_meta_framebuffer *framebuffer,
793                       uint32_t layer,
794                       const VkImageCopy2KHR *region)
795 {
796    emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region);
797    emit_supertile_coordinates(job, framebuffer);
798 }
799 
800 void
v3dX(meta_emit_copy_image_rcl)801 v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
802                                struct v3dv_image *dst,
803                                struct v3dv_image *src,
804                                struct v3dv_meta_framebuffer *framebuffer,
805                                const VkImageCopy2KHR *region)
806 {
807    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
808    v3dv_return_if_oom(NULL, job);
809 
810    emit_frame_setup(job, 0, NULL);
811    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
812       emit_copy_image_layer(job, dst, src, framebuffer, layer, region);
813    cl_emit(rcl, END_OF_RENDERING, end);
814 }
815 
816 void
v3dX(meta_emit_tfu_job)817 v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
818                         struct v3dv_image *dst,
819                         uint32_t dst_mip_level,
820                         uint32_t dst_layer,
821                         struct v3dv_image *src,
822                         uint32_t src_mip_level,
823                         uint32_t src_layer,
824                         uint32_t width,
825                         uint32_t height,
826                         const struct v3dv_format *format)
827 {
828    const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level];
829    const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level];
830 
831    assert(dst->mem && dst->mem->bo);
832    const struct v3dv_bo *dst_bo = dst->mem->bo;
833 
834    assert(src->mem && src->mem->bo);
835    const struct v3dv_bo *src_bo = src->mem->bo;
836 
837    struct drm_v3d_submit_tfu tfu = {
838       .ios = (height << 16) | width,
839       .bo_handles = {
840          dst_bo->handle,
841          src_bo->handle != dst_bo->handle ? src_bo->handle : 0
842       },
843    };
844 
845    const uint32_t src_offset =
846       src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer);
847    tfu.iia |= src_offset;
848 
849    uint32_t icfg;
850    if (src_slice->tiling == V3D_TILING_RASTER) {
851       icfg = V3D_TFU_ICFG_FORMAT_RASTER;
852    } else {
853       icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE +
854              (src_slice->tiling - V3D_TILING_LINEARTILE);
855    }
856    tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT;
857 
858    const uint32_t dst_offset =
859       dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer);
860    tfu.ioa |= dst_offset;
861 
862    tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
863                (dst_slice->tiling - V3D_TILING_LINEARTILE)) <<
864                 V3D_TFU_IOA_FORMAT_SHIFT;
865    tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
866 
867    switch (src_slice->tiling) {
868    case V3D_TILING_UIF_NO_XOR:
869    case V3D_TILING_UIF_XOR:
870       tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp));
871       break;
872    case V3D_TILING_RASTER:
873       tfu.iis |= src_slice->stride / src->cpp;
874       break;
875    default:
876       break;
877    }
878 
879    /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
880     * OPAD field for the destination (how many extra UIF blocks beyond
881     * those necessary to cover the height).
882     */
883    if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR ||
884        dst_slice->tiling == V3D_TILING_UIF_XOR) {
885       uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp);
886       uint32_t implicit_padded_height = align(height, uif_block_h);
887       uint32_t icfg =
888          (dst_slice->padded_height - implicit_padded_height) / uif_block_h;
889       tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
890    }
891 
892    v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
893 }
894 
static void
emit_clear_image_layer_per_tile_list(struct v3dv_job *job,
                                     struct v3dv_meta_framebuffer *framebuffer,
                                     struct v3dv_image *image,
                                     VkImageAspectFlags aspects,
                                     uint32_t layer,
                                     uint32_t level)
{
   /* Emits a generic tile list that stores the tile buffer contents into
    * one layer/level of 'image'. No loads are emitted: the tile buffer at
    * store time holds the clear value set up by the caller's RCL prologue
    * and frame setup (see v3dX(meta_emit_clear_image_rcl)).
    */
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve enough space up front for all the packets emitted below. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_image_store(job->device, cl, framebuffer, image, aspects,
                    layer, level, false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Reference the tile list we just wrote from the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
927 
928 static void
emit_clear_image_layers(struct v3dv_job * job,struct v3dv_image * image,struct v3dv_meta_framebuffer * framebuffer,VkImageAspectFlags aspects,uint32_t min_layer,uint32_t max_layer,uint32_t level)929 emit_clear_image_layers(struct v3dv_job *job,
930                  struct v3dv_image *image,
931                  struct v3dv_meta_framebuffer *framebuffer,
932                  VkImageAspectFlags aspects,
933                  uint32_t min_layer,
934                  uint32_t max_layer,
935                  uint32_t level)
936 {
937    for (uint32_t layer = min_layer; layer < max_layer; layer++) {
938       emit_clear_image_layer_per_tile_list(job, framebuffer, image, aspects,
939                                            layer, level);
940       emit_supertile_coordinates(job, framebuffer);
941    }
942 }
943 
944 void
v3dX(meta_emit_clear_image_rcl)945 v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job,
946                                 struct v3dv_image *image,
947                                 struct v3dv_meta_framebuffer *framebuffer,
948                                 const union v3dv_clear_value *clear_value,
949                                 VkImageAspectFlags aspects,
950                                 uint32_t min_layer,
951                                 uint32_t max_layer,
952                                 uint32_t level)
953 {
954    const struct rcl_clear_info clear_info = {
955       .clear_value = clear_value,
956       .image = image,
957       .aspects = aspects,
958       .level = level,
959    };
960 
961    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
962    v3dv_return_if_oom(NULL, job);
963 
964    emit_frame_setup(job, 0, clear_value);
965    emit_clear_image_layers(job, image, framebuffer, aspects,
966                            min_layer, max_layer, level);
967    cl_emit(rcl, END_OF_RENDERING, end);
968 }
969 
static void
emit_fill_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *bo,
                               uint32_t offset,
                               uint32_t stride)
{
   /* Emits a generic tile list that stores the tile buffer (holding the
    * fill pattern set up as a clear value by the caller, see
    * v3dX(meta_emit_fill_buffer_rcl)) to a linear RGBA8UI region of 'bo'.
    * No loads are needed.
    */
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve enough space up front for all the packets emitted below. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false,
                     V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Reference the tile list we just wrote from the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
1000 
1001 static void
emit_fill_buffer(struct v3dv_job * job,struct v3dv_bo * bo,uint32_t offset,struct v3dv_meta_framebuffer * framebuffer)1002 emit_fill_buffer(struct v3dv_job *job,
1003                  struct v3dv_bo *bo,
1004                  uint32_t offset,
1005                  struct v3dv_meta_framebuffer *framebuffer)
1006 {
1007    const uint32_t stride = job->frame_tiling.width * 4;
1008    emit_fill_buffer_per_tile_list(job, bo, offset, stride);
1009    emit_supertile_coordinates(job, framebuffer);
1010 }
1011 
1012 void
v3dX(meta_emit_fill_buffer_rcl)1013 v3dX(meta_emit_fill_buffer_rcl)(struct v3dv_job *job,
1014                                 struct v3dv_bo *bo,
1015                                 uint32_t offset,
1016                                 struct v3dv_meta_framebuffer *framebuffer,
1017                                 uint32_t data)
1018 {
1019    const union v3dv_clear_value clear_value = {
1020        .color = { data, 0, 0, 0 },
1021    };
1022 
1023    const struct rcl_clear_info clear_info = {
1024       .clear_value = &clear_value,
1025       .image = NULL,
1026       .aspects = VK_IMAGE_ASPECT_COLOR_BIT,
1027       .level = 0,
1028    };
1029 
1030    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
1031    v3dv_return_if_oom(NULL, job);
1032 
1033    emit_frame_setup(job, 0, &clear_value);
1034    emit_fill_buffer(job, bo, offset, framebuffer);
1035    cl_emit(rcl, END_OF_RENDERING, end);
1036 }
1037 
1038 
static void
emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_image *image,
                                        struct v3dv_buffer *buffer,
                                        uint32_t layer,
                                        const VkBufferImageCopy2KHR *region)
{
   /* Emits a generic tile list that uploads one layer of 'region' from
    * 'buffer' into 'image' through the tile buffer: a linear load from
    * the buffer, then an image store, with extra load/store pairs to
    * preserve the untouched aspect of combined depth/stencil images.
    */
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve enough space up front for all the packets emitted below. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) ||
          layer < image->vk.extent.depth);

   /* Load TLB from buffer */
   /* Per the Vulkan spec, bufferRowLength/bufferImageHeight of 0 mean the
    * buffer is tightly packed to the image extent.
    */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy to compressed format using a compatible format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* Stencil-only copies use 1 byte per pixel regardless of image cpp. */
   uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
                  1 : image->cpp;
   uint32_t buffer_stride = width * cpp;
   /* Each layer's data starts a full image slice further into the buffer. */
   uint32_t buffer_offset =
      buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer;

   uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask,
                                       false, false, true);

   emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo,
                    buffer_offset, buffer_stride, format);

   /* Because we can't do raster loads/stores of Z/S formats we need to
    * use a color tile buffer with a compatible RGBA color format instead.
    * However, when we are uploading a single aspect to a combined
    * depth/stencil image we have the problem that our tile buffer stores don't
    * allow us to mask out the other aspect, so we always write all four RGBA
    * channels to the image and we end up overwriting that other aspect with
    * undefined values. To work around that, we first load the aspect we are
    * not copying from the image memory into a proper Z/S tile buffer. Then we
    * do our store from the color buffer for the aspect we are copying, and
    * after that, we do another store from the Z/S tile buffer to restore the
    * other aspect to its original value.
    */
   if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         emit_image_load(job->device, cl, framebuffer, image,
                         VK_IMAGE_ASPECT_STENCIL_BIT,
                         imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                         false, false);
      } else {
         assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
         emit_image_load(job->device, cl, framebuffer, image,
                         VK_IMAGE_ASPECT_DEPTH_BIT,
                         imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                         false, false);
      }
   }

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to image */
   emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask,
                    imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                    false, true, false);

   /* Restore the aspect we were not copying (loaded above). */
   if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         emit_image_store(job->device, cl, framebuffer, image,
                          VK_IMAGE_ASPECT_STENCIL_BIT,
                          imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                          false, false, false);
      } else {
         assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
         emit_image_store(job->device, cl, framebuffer, image,
                          VK_IMAGE_ASPECT_DEPTH_BIT,
                          imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                          false, false, false);
      }
   }

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Reference the tile list we just wrote from the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
1147 
1148 static void
emit_copy_buffer_to_layer(struct v3dv_job * job,struct v3dv_image * image,struct v3dv_buffer * buffer,struct v3dv_meta_framebuffer * framebuffer,uint32_t layer,const VkBufferImageCopy2KHR * region)1149 emit_copy_buffer_to_layer(struct v3dv_job *job,
1150                           struct v3dv_image *image,
1151                           struct v3dv_buffer *buffer,
1152                           struct v3dv_meta_framebuffer *framebuffer,
1153                           uint32_t layer,
1154                           const VkBufferImageCopy2KHR *region)
1155 {
1156    emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer,
1157                                            layer, region);
1158    emit_supertile_coordinates(job, framebuffer);
1159 }
1160 
1161 void
v3dX(meta_emit_copy_buffer_to_image_rcl)1162 v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
1163                                          struct v3dv_image *image,
1164                                          struct v3dv_buffer *buffer,
1165                                          struct v3dv_meta_framebuffer *framebuffer,
1166                                          const VkBufferImageCopy2KHR *region)
1167 {
1168    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
1169    v3dv_return_if_oom(NULL, job);
1170 
1171    emit_frame_setup(job, 0, NULL);
1172    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
1173       emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region);
1174    cl_emit(rcl, END_OF_RENDERING, end);
1175 }
1176 
1177 /* Figure out a TLB size configuration for a number of pixels to process.
1178  * Beware that we can't "render" more than 4096x4096 pixels in a single job,
1179  * if the pixel count is larger than this, the caller might need to split
1180  * the job and call this function multiple times.
1181  */
static void
framebuffer_size_for_pixel_count(uint32_t num_pixels,
                                 uint32_t *width,
                                 uint32_t *height)
{
   /* Picks a width x height "framebuffer" shape whose pixel count is as
    * close as possible to (and never larger than) num_pixels, with both
    * dimensions capped at 4096.
    */
   assert(num_pixels > 0);

   const uint32_t max_dim_pixels = 4096;
   const uint32_t max_pixels = max_dim_pixels * max_dim_pixels;

   uint32_t w = num_pixels;
   uint32_t h = 1;
   if (num_pixels > max_pixels) {
      /* Can't cover everything in one job: use the largest legal frame. */
      w = max_dim_pixels;
      h = max_dim_pixels;
   } else {
      /* Fold factors of two from the width into the height until the
       * width is legal and the shape is reasonably balanced.
       */
      while (w > max_dim_pixels || ((w & 1) == 0 && w > 2 * h)) {
         w /= 2;
         h *= 2;
      }
   }
   assert(w <= max_dim_pixels && h <= max_dim_pixels);
   assert(w * h <= num_pixels);
   assert(w > 0 && h > 0);

   *width = w;
   *height = h;
}
1211 
1212 struct v3dv_job *
v3dX(meta_copy_buffer)1213 v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
1214                        struct v3dv_bo *dst,
1215                        uint32_t dst_offset,
1216                        struct v3dv_bo *src,
1217                        uint32_t src_offset,
1218                        const VkBufferCopy2KHR *region)
1219 {
1220    const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
1221    const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
1222 
1223    /* Select appropriate pixel format for the copy operation based on the
1224     * size to copy and the alignment of the source and destination offsets.
1225     */
1226    src_offset += region->srcOffset;
1227    dst_offset += region->dstOffset;
1228    uint32_t item_size = 4;
1229    while (item_size > 1 &&
1230           (src_offset % item_size != 0 || dst_offset % item_size != 0)) {
1231       item_size /= 2;
1232    }
1233 
1234    while (item_size > 1 && region->size % item_size != 0)
1235       item_size /= 2;
1236 
1237    assert(region->size % item_size == 0);
1238    uint32_t num_items = region->size / item_size;
1239    assert(num_items > 0);
1240 
1241    uint32_t format;
1242    VkFormat vk_format;
1243    switch (item_size) {
1244    case 4:
1245       format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
1246       vk_format = VK_FORMAT_R8G8B8A8_UINT;
1247       break;
1248    case 2:
1249       format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI;
1250       vk_format = VK_FORMAT_R8G8_UINT;
1251       break;
1252    default:
1253       format = V3D_OUTPUT_IMAGE_FORMAT_R8UI;
1254       vk_format = VK_FORMAT_R8_UINT;
1255       break;
1256    }
1257 
1258    struct v3dv_job *job = NULL;
1259    while (num_items > 0) {
1260       job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1261       if (!job)
1262          return NULL;
1263 
1264       uint32_t width, height;
1265       framebuffer_size_for_pixel_count(num_items, &width, &height);
1266 
1267       v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);
1268 
1269       struct v3dv_meta_framebuffer framebuffer;
1270       v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
1271                                   &job->frame_tiling);
1272 
1273       v3dX(job_emit_binning_flush)(job);
1274 
1275       v3dX(meta_emit_copy_buffer_rcl)(job, dst, src, dst_offset, src_offset,
1276                                       &framebuffer, format, item_size);
1277 
1278       v3dv_cmd_buffer_finish_job(cmd_buffer);
1279 
1280       const uint32_t items_copied = width * height;
1281       const uint32_t bytes_copied = items_copied * item_size;
1282       num_items -= items_copied;
1283       src_offset += bytes_copied;
1284       dst_offset += bytes_copied;
1285    }
1286 
1287    return job;
1288 }
1289 
1290 void
v3dX(meta_fill_buffer)1291 v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
1292                        struct v3dv_bo *bo,
1293                        uint32_t offset,
1294                        uint32_t size,
1295                        uint32_t data)
1296 {
1297    assert(size > 0 && size % 4 == 0);
1298    assert(offset + size <= bo->size);
1299 
1300    const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
1301    const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
1302    uint32_t num_items = size / 4;
1303 
1304    while (num_items > 0) {
1305       struct v3dv_job *job =
1306          v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1307       if (!job)
1308          return;
1309 
1310       uint32_t width, height;
1311       framebuffer_size_for_pixel_count(num_items, &width, &height);
1312 
1313       v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);
1314 
1315       struct v3dv_meta_framebuffer framebuffer;
1316       v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
1317                                   internal_type, &job->frame_tiling);
1318 
1319       v3dX(job_emit_binning_flush)(job);
1320 
1321       v3dX(meta_emit_fill_buffer_rcl)(job, bo, offset, &framebuffer, data);
1322 
1323       v3dv_cmd_buffer_finish_job(cmd_buffer);
1324 
1325       const uint32_t items_copied = width * height;
1326       const uint32_t bytes_copied = items_copied * 4;
1327       num_items -= items_copied;
1328       offset += bytes_copied;
1329    }
1330 }
1331 
1332 void
v3dX(meta_framebuffer_init)1333 v3dX(meta_framebuffer_init)(struct v3dv_meta_framebuffer *fb,
1334                             VkFormat vk_format,
1335                             uint32_t internal_type,
1336                             const struct v3dv_frame_tiling *tiling)
1337 {
1338    fb->internal_type = internal_type;
1339 
1340    /* Supertile coverage always starts at 0,0  */
1341    uint32_t supertile_w_in_pixels =
1342       tiling->tile_width * tiling->supertile_width;
1343    uint32_t supertile_h_in_pixels =
1344       tiling->tile_height * tiling->supertile_height;
1345 
1346    fb->min_x_supertile = 0;
1347    fb->min_y_supertile = 0;
1348    fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels;
1349    fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels;
1350 
1351    fb->vk_format = vk_format;
1352    fb->format = v3dX(get_format)(vk_format);
1353 
1354    fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F;
1355    if (vk_format_is_depth_or_stencil(vk_format))
1356       fb->internal_depth_type = v3dX(get_internal_depth_type)(vk_format);
1357 }
1358