/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_private.h"

#include "tu_cs.h"
#include "vk_format.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_tracepoints.h"

static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

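/* Map a VkFormat to the a6xx_2d_ifmt value the 2D blitter operates in for
 * that destination; this also decides how clear values are packed in
 * r2d_clear_value() below.
 */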
static enum a6xx_2d_ifmt
format_to_ifmt(VkFormat format)
{
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
      return R2D_FLOAT32;
   if (format == VK_FORMAT_S8_UINT)
      return R2D_INT8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = vk_format_is_int(format);
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (vk_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
      return 0;
   }
}

static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst->x,                     .y = dst->y),
      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src->x),
                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}

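/* Pack the clear value according to the destination's 2D-engine ifmt and
 * emit it as the solid source color for a BLIT_OP_SCALE clear.
 */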
static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!vk_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = vk_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

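/* Emit 2D blit source state from an image view, offset to the given layer. */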
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, iview->stencil_PITCH << 9);
}

static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
                   A6XX_SP_PS_2D_SRC(.qword = va),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}

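/* Emit 2D blit destination state from an image view, offset to the given
 * layer.
 */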
static void
r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, iview->RB_2D_DST_INFO);
   tu_cs_image_ref_2d(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, iview->stencil_PITCH);
}

static void
r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

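/* Emit the common setup for 2D blits and clears: the blit control registers,
 * the destination format, and the 8c01 value which (per the comments below)
 * makes a depth-only or stencil-only write to D24S8 preserve the other
 * channel.
 */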
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   enum a6xx_format format = tu6_base_format(vk_format);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
   uint32_t unknown_8c01 = 0;

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   /* note: the only format with partial clearing is D24S8 */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .scissor = scissor,
         .rotate = blit_param,
         .solid_color = clear,
         .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .color_format = format,
         .mask = 0xf,
         .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (format == FMT6_10_10_10_2_UNORM_DEST)
      format = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
         .sint = vk_format_is_sint(vk_format),
         .uint = vk_format_is_uint(vk_format),
         .color_format = format,
         .srgb = vk_format_is_srgb(vk_format),
         .mask = 0xf));
}

static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}

/* r3d_ = shader path operations */

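/* Load "components" 32-bit values from the constant file starting at const
 * offset "base"; the values are uploaded by the r3d_coords_raw() and
 * r3d_clear_value() helpers below.
 */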
static nir_ssa_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
                           .base = base);
}

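/* Vertex shader for the blit path: positions come from c0.xy/c1.xy and
 * texture coords from c0.zw/c1.zw (plus c2.x for the z-scale path),
 * selected by the vertex id of the two-vertex RECTLIST.
 */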
static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
   nir_ssa_def *vertex = nir_load_vertex_id(b);

   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_ssa_def *vert0_coords = load_const(b, 2, 2);
   nir_ssa_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_ssa_def *z_coord = load_const(b, 8, 1);

   nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "clear vs");
   nir_builder *b = &_b;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_ssa_def *depth = load_const(b, 2, 1);
   nir_ssa_def *vertex = nir_load_vertex_id(b);

   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_ssa_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

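/* Fragment shader for the blit path: sample the source texture at the
 * interpolated coords (a 3d texture for the z-scale variant) and write the
 * result to MRT 0.
 */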
static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(coord);
   tex->coord_components = 2;

   tex->src[1].src_type = nir_tex_src_ms_index;
   tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);

   return b->shader;
}

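/* Fragment shader for clears: write a clear color taken from the constant
 * file to each of "mrts" render targets.
 */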
static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_ssa_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

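/* Compile one of the built-in shaders to an ir3 variant and copy its binary
 * into the global BO, advancing *offset (in dwords) past the shader.
 */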
static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   struct ir3_shader *sh = ir3_shader_from_nir(dev->compiler, nir,
                                               align(consts, 4), NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo.map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo.iova +
      gb_offset(shaders[*offset]);
   *offset += align(so->info.sizedwords, 32);
}

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     GLOBAL_SH_FS_CLEAR0 + num_rts);
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]->shader);
   }
}

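/* Emit the state shared by the 3D blit/clear paths: bind the appropriate
 * global VS/FS pair and set up everything needed to draw a two-vertex
 * RECTLIST covering the destination.
 */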
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR;

   struct ir3_shader_variant *vs = cmd->device->global_shaders[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale)
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   else if (samples != VK_SAMPLE_COUNT_1_BIT)
      fs_id = GLOBAL_SH_FS_COPY_MS;

   unsigned num_rts = util_bitcount(rts_mask);
   if (!blit)
      fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;

   struct ir3_shader_variant *fs = cmd->device->global_shaders[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {
      /* Copy what the blob does here. This will emit an extra 0x3f
       * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
       * this is working around yet.
       */
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
      tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
      tu_cs_emit(cs, 0);
   } else {
      tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());
   }
   tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());

   tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
   tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid));
      }
   }

   cmd->state.line_mode = RECTANGULAR;
   tu6_emit_msaa(cs, samples, cmd->state.line_mode);
}

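/* Upload the raw vertex constants consumed by the blit VS:
 * c0 = (dst0.x, dst0.y, src0.x, src0.y), c1 = (dst1.x, dst1.y, src1.x, src1.y)
 */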
static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(2));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cs *cs, float z)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit(cs, fui(z));
   tu_cs_emit(cs, 0);
   tu_cs_emit(cs, 0);
   tu_cs_emit(cs, 0);
}

static void
r3d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   int32_t src_x1 = src ? src->x : 0;
   int32_t src_y1 = src ? src->y : 0;
   r3d_coords_raw(cs, (float[]) {
      dst->x,                 dst->y,
      src_x1,                 src_y1,
      dst->x + extent->width, dst->y + extent->height,
      src_x1 + extent->width, src_y1 + extent->height,
   });
}

static void
r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
   } break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      tu_cs_emit(cs, fui(val->depthStencil.depth));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   case VK_FORMAT_S8_UINT:
      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!vk_format_is_depth_or_stencil(format));
      tu_cs_emit_array(cs, val->color.uint32, 4);
      break;
   }
}

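/* Emit texture state for the 3D path's source: suballocate a texture
 * descriptor plus sampler from the command buffer, patch in the per-layer
 * base/UBWC addresses, and bind the result as FS texture 0.
 */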
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      0x1 | /* XXX used by blob, doesn't seem necessary */
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
      CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

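/* Source is an image view: use its premade descriptor, offset to the
 * requested layer.
 */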
static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   r3d_src_common(cmd, cs, iview->descriptor,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   desc[0] =
      COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(format.fmt) |
      A6XX_TEX_CONST_0_SWAP(format.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      // XXX to swizzle into .w for stencil buffer_to_image
      A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_src_gmem(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             VkFormat format,
             uint32_t gmem_offset,
             uint32_t cpp)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->descriptor, sizeof(desc));

   /* patch the format so that depth/stencil get the right format */
   desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
   desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt);

   /* patched for gmem */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
   desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
   desc[2] =
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
      A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
   desc[3] = 0;
   desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
   desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

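/* Emit 3D path destination state: the destination is bound as color
 * attachment MRT 0.
 */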
static void
r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
}

static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
   tu_cs_image_stencil_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static void
r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, .qword = va),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static uint8_t
aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}

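/* Set up the 3D (draw-based) path for a blit or clear to MRT 0. Predication
 * is disabled here and re-enabled in r3d_teardown(), presumably because
 * these internal draws shouldn't be affected by conditional rendering.
 */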
static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   enum a6xx_format format = tu6_base_format(vk_format);

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));

   r3d_common(cmd, cs, !clear, 1, blit_param, samples);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                        .color_format = format,
                        .color_sint = vk_format_is_sint(vk_format),
                        .color_uint = vk_format_is_uint(vk_format)));

   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
      .component_enable = aspect_write_mask(vk_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}

static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
   tu_cs_emit(cs, 1); /* instance count */
   tu_cs_emit(cs, 2); /* vertex count */
}

static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 1);
   }
}

/* blit ops - common interface for 2d/shader paths */

struct blit_ops {
   void (*coords)(struct tu_cs *cs,
                  const VkOffset2D *dst,
                  const VkOffset2D *src,
                  const VkExtent2D *extent);
   void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
   void (*src)(
        struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter);
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      VkFormat vk_format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height);
   void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
                 bool clear,
                 bool ubwc,
                 VkSampleCountFlagBits samples);
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
   void (*teardown)(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs);
};

static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src,
   .src_buffer = r2d_src_buffer,
   .dst = r2d_dst,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup,
   .run = r2d_run,
   .teardown = r2d_teardown,
};

static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup,
   .run = r3d_run,
   .teardown = r3d_teardown,
};

/* passthrough set coords from 3D extents */
static void
coords(const struct blit_ops *ops,
       struct tu_cs *cs,
       const VkOffset3D *dst,
       const VkOffset3D *src,
       const VkExtent3D *extent)
{
   ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
}

/* Decides the VK format to treat our data as for a memcpy-style blit. We have
 * to be a bit careful because we have to pick a format with matching UBWC
 * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
 * everything.
 */
static VkFormat
copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
{
   if (vk_format_is_compressed(format)) {
      switch (vk_format_get_blocksize(format)) {
      case 1: return VK_FORMAT_R8_UINT;
      case 2: return VK_FORMAT_R16_UINT;
      case 4: return VK_FORMAT_R32_UINT;
      case 8: return VK_FORMAT_R32G32_UINT;
      case 16: return VK_FORMAT_R32G32B32A32_UINT;
      default:
         unreachable("unhandled format size");
      }
   }

   switch (format) {
   /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
    * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
    * (also -1.0), when we're supposed to be memcpying the bits. See
    * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
    */
   case VK_FORMAT_R8_SNORM:
      return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_R8G8_SNORM:
      return VK_FORMAT_R8G8_UNORM;
   case VK_FORMAT_R8G8B8_SNORM:
      return VK_FORMAT_R8G8B8_UNORM;
   case VK_FORMAT_B8G8R8_SNORM:
      return VK_FORMAT_B8G8R8_UNORM;
   case VK_FORMAT_R8G8B8A8_SNORM:
      return VK_FORMAT_R8G8B8A8_UNORM;
   case VK_FORMAT_B8G8R8A8_SNORM:
      return VK_FORMAT_B8G8R8A8_UNORM;
   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
      return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
   case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
      return VK_FORMAT_A2R10G10B10_UNORM_PACK32;
   case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
      return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
   case VK_FORMAT_R16_SNORM:
      return VK_FORMAT_R16_UNORM;
   case VK_FORMAT_R16G16_SNORM:
      return VK_FORMAT_R16G16_UNORM;
   case VK_FORMAT_R16G16B16_SNORM:
      return VK_FORMAT_R16G16B16_UNORM;
   case VK_FORMAT_R16G16B16A16_SNORM:
      return VK_FORMAT_R16G16B16A16_UNORM;

   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_UINT;

   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
         return VK_FORMAT_R8G8_UNORM;
      else
         return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
      return VK_FORMAT_R8_UNORM;

   case VK_FORMAT_D24_UNORM_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
         return VK_FORMAT_R8_UNORM;
      else
         return format;

   case VK_FORMAT_D32_SFLOAT_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         return VK_FORMAT_S8_UINT;
      assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
      return VK_FORMAT_D32_SFLOAT;

   default:
      return format;
   }
}

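/* Clear an image's LRZ buffer by treating it as a linear D16 buffer
 * (lrz_pitch is in pixels, so the byte pitch is lrz_pitch * 2).
 */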
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image *image,
              const VkClearValue *value)
{
   const struct blit_ops *ops = &r2d_ops;

   ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
   ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
                   image->bo->iova + image->bo_offset + image->lrz_offset,
                   image->lrz_pitch * 2);
   ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);
}

static void
tu_image_view_copy_blit(struct tu_image_view *iview,
                        struct tu_image *image,
                        VkFormat format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool stencil_read,
                        bool z_scale)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   tu_image_view_init(iview, &(VkImageViewCreateInfo) {
      .image = tu_image_to_handle(image),
      .viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D,
      .format = format,
      /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
      .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
      .subresourceRange = {
         .aspectMask = aspect_mask,
         .baseMipLevel = subres->mipLevel,
         .levelCount = 1,
         .baseArrayLayer = subres->baseArrayLayer + layer,
         .layerCount = 1,
      },
   }, false);
}

static void
tu_image_view_copy(struct tu_image_view *iview,
                   struct tu_image *image,
                   VkFormat format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer,
                   bool stencil_read)
{
   format = copy_format(format, subres->aspectMask, false);
   tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false);
}

static void
tu_image_view_blit(struct tu_image_view *iview,
                   struct tu_image *image,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false);
}

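/* Implements a single VkImageBlit region: picks the 2D or 3D path (the 3D
 * path handles multisample destinations, BC1_RGB sources, cubic filtering
 * and z scaling), then emits one blit per destination layer/slice.
 */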
1302 static void
tu6_blit_image(struct tu_cmd_buffer * cmd,struct tu_image * src_image,struct tu_image * dst_image,const VkImageBlit * info,VkFilter filter)1303 tu6_blit_image(struct tu_cmd_buffer *cmd,
1304                struct tu_image *src_image,
1305                struct tu_image *dst_image,
1306                const VkImageBlit *info,
1307                VkFilter filter)
1308 {
1309    const struct blit_ops *ops = &r2d_ops;
1310    struct tu_cs *cs = &cmd->cs;
1311    bool z_scale = false;
1312    uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
1313 
1314    /* 2D blit can't do rotation mirroring from just coordinates */
1315    static const enum a6xx_rotation rotate[2][2] = {
1316       {ROTATE_0, ROTATE_HFLIP},
1317       {ROTATE_VFLIP, ROTATE_180},
1318    };
1319 
1320    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1321                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
1322    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1323                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
1324 
1325    int32_t src0_z = info->srcOffsets[0].z;
1326    int32_t src1_z = info->srcOffsets[1].z;
1327 
1328    if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
1329         info->dstOffsets[1].z - info->dstOffsets[0].z) ||
1330        info->srcOffsets[1].z < info->srcOffsets[0].z) {
1331       z_scale = true;
1332    }
1333 
1334    if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
1335       layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
1336       src0_z = info->srcOffsets[1].z;
1337       src1_z = info->srcOffsets[0].z;
1338    }
1339 
1340    if (info->dstSubresource.layerCount > 1) {
1341       assert(layers <= 1);
1342       layers = info->dstSubresource.layerCount;
1343    }
1344 
1345    /* BC1_RGB_* formats need to have their last components overriden with 1
1346     * when sampling, which is normally handled with the texture descriptor
1347     * swizzle. The 2d path can't handle that, so use the 3d path.
1348     *
1349     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1350     * the 2d path.
1351     */

   unsigned blit_param = rotate[mirror_y][mirror_x];
   if (dst_image->layout[0].nr_samples > 1 ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
       filter == VK_FILTER_CUBIC_EXT ||
       z_scale) {
      ops = &r3d_ops;
      blit_param = z_scale;
   }

   /* use the right format in setup() for D32_S8
    * TODO: this probably should use a helper
    */
   VkFormat format = dst_image->vk_format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
         format = VK_FORMAT_D32_SFLOAT;
      else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
         format = VK_FORMAT_S8_UINT;
      else
         unreachable("unexpected D32_S8 aspect mask in blit_image");
   }

   trace_start_blit(&cmd->trace, cs);

   ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
              blit_param, false, dst_image->layout[0].ubwc,
              dst_image->layout[0].nr_samples);

   if (ops == &r3d_ops) {
      r3d_coords_raw(cs, (float[]) {
         info->dstOffsets[0].x, info->dstOffsets[0].y,
         info->srcOffsets[0].x, info->srcOffsets[0].y,
         info->dstOffsets[1].x, info->dstOffsets[1].y,
         info->srcOffsets[1].x, info->srcOffsets[1].y
      });
   } else {
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
                             .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
         A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
                             .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
         A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
         A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
         A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
   }

   struct tu_image_view dst, src;
   tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
                      MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));

   if (z_scale) {
      tu_image_view_copy_blit(&src, src_image, src_image->vk_format,
                              &info->srcSubresource, 0, false, true);
      ops->src(cmd, cs, &src, 0, filter);
   } else {
      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
   }

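   /* Example of the z_scale sampling below (hypothetical values): with
    * layers = 4, src0_z = 0 and src1_z = 2, the per-layer source Z is
    * sampled at the layer centers 0.25, 0.75, 1.25 and 1.75.
    */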
   for (uint32_t i = 0; i < layers; i++) {
      if (z_scale) {
         float t = ((float) i + 0.5f) / (float) layers;
         r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);
      } else {
         ops->src(cmd, cs, &src, i, filter);
      }
      ops->dst(cs, &dst, i);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_blit(&cmd->trace, cs,
                  ops == &r3d_ops,
                  src_image->vk_format,
                  dst_image->vk_format,
                  layers);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBlitImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage dstImage,
                VkImageLayout dstImageLayout,
                uint32_t regionCount,
                const VkImageBlit *pRegions,
                VkFilter filter)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);

   for (uint32_t i = 0; i < regionCount; ++i) {
      /* can't blit both depth and stencil at once with D32_S8
       * TODO: more advanced 3D blit path to support it instead?
       */
      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
          dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageBlit region = pRegions[i];
         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
            region.srcSubresource.aspectMask = BIT(b);
            region.dstSubresource.aspectMask = BIT(b);
            tu6_blit_image(cmd, src_image, dst_image, &region, filter);
         }
         continue;
      }
      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
   }
}

static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}
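
/* For example (hypothetical copy): with a BC1 format (4x4 blocks), an
 * imageOffset of (8, 4) becomes block offset (2, 1), and a 13x6-texel
 * extent rounds up to 4x2 blocks.
 */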

static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat src_format =
      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
   const struct blit_ops *ops = &r2d_ops;

   /* special case for buffer to stencil */
   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      ops = &r3d_ops;
   }

   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
    * which matters for UBWC. buffer_to_image/etc can fail because of this
    */

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t src_width = info->bufferRowLength ?: extent.width;
   uint32_t src_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);

   uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
   uint32_t layer_size = src_height * pitch;
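   /* For example (hypothetical copy): a 100-texel-wide R8G8B8A8 region with
    * bufferRowLength = 0 gives pitch = 100 * 4 = 400 bytes, which is not
    * 64-byte aligned and so takes the per-row path below.
    */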

   ops->setup(cmd, cs,
              copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
              info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
              dst_image->layout[0].nr_samples);

   struct tu_image_view dst;
   tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);

      uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
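      /* The 2D engine wants 64-byte-aligned addresses and pitches. If either
       * is unaligned, copy row by row: round the address down to a 64-byte
       * boundary and compensate with an X offset in whole texels. E.g.
       * (hypothetical) a src_va ending in 0x10 with 4-byte texels starts the
       * source rect at x = 4.
       */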
      if ((src_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                            x + extent.width, 1);
            ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            src_va += pitch;
         }
      } else {
         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
         coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
                        VkBuffer srcBuffer,
                        VkImage dstImage,
                        VkImageLayout dstImageLayout,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
}

static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_buffer *dst_buffer,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat dst_format =
      copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
   bool stencil_read = false;

   if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      stencil_read = true;
   }

   const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t dst_width = info->bufferRowLength ?: extent.width;
   uint32_t dst_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);

   uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
   uint32_t layer_size = pitch * dst_height;

   ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
              VK_SAMPLE_COUNT_1_BIT);

   struct tu_image_view src;
   tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);

      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
      if ((dst_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            dst_va += pitch;
         }
      } else {
         ops->dst_buffer(cs, dst_format, dst_va, pitch);
         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
                        VkImage srcImage,
                        VkImageLayout srcImageLayout,
                        VkBuffer dstBuffer,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
}

/* Tiled formats don't support swapping, which means that we can't support
 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
 * formats like B5G5R5A1 have a separate linear-only format when sampling.
 * Currently we fake support for tiled swapped formats and use the unswapped
 * format instead, but this means that reinterpreting copies to and from
 * swapped formats can't be performed correctly unless we can swizzle the
 * components by reinterpreting the other image as the "correct" swapped
 * format, i.e. only when the other image is linear.
 */

static bool
is_swapped_format(VkFormat format)
{
   struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
   struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
}
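
/* E.g. VK_FORMAT_B8G8R8A8_UNORM: it needs a non-WZYX swap (see the comment
 * above), which only the linear layout provides, so the linear and tiled
 * descriptors disagree and the format counts as "swapped".
 */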

/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
 * versa). This should mirror the logic in fdl6_layout.
 */
static bool
image_is_r8g8(struct tu_image *image)
{
   return image->layout[0].cpp == 2 &&
      vk_format_get_nr_components(image->vk_format) == 2;
}
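
/* E.g. VK_FORMAT_R8G8_UNORM (cpp = 2, two components) is R8G8, while
 * VK_FORMAT_R16_UINT (cpp = 2, one component) is not, so the two can't be
 * reinterpreted as each other when tiled.
 */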

static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                       struct tu_image *src_image,
                       struct tu_image *dst_image,
                       const VkImageCopy *info)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (dst_image->layout[0].nr_samples > 1)
      ops = &r3d_ops;

   VkFormat format = VK_FORMAT_UNDEFINED;
   VkOffset3D src_offset = info->srcOffset;
   VkOffset3D dst_offset = info->dstOffset;
   VkExtent3D extent = info->extent;
   uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);

   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
    * Images":
    *
    *    When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination. When copying from a compressed image to an
    *    uncompressed image the image texel dimensions written to the
    *    uncompressed image will be source extent divided by the compressed
    *    texel block dimensions. When copying from an uncompressed image to a
    *    compressed image the image texel dimensions written to the compressed
    *    image will be the source extent multiplied by the compressed texel
    *    block dimensions.
    *
    * This means we only have to adjust the extent if the source image is
    * compressed.
    */
   copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
   copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
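   /* For example (hypothetical copy): copying 16x16 texels from a BC1 image
    * (4x4 blocks) to an uncompressed image divides only the source extent,
    * so 4x4 texels are written to the destination.
    */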

   VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
   VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);

   bool use_staging_blit = false;

   if (src_format == dst_format) {
      /* Images that share a format can always be copied directly because it's
       * the same as a blit.
       */
      format = src_format;
   } else if (!src_image->layout[0].tile_mode) {
      /* If an image is linear, we can always safely reinterpret it with the
       * other image's format and then do a regular blit.
       */
      format = dst_format;
   } else if (!dst_image->layout[0].tile_mode) {
      format = src_format;
   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
      /* We can't currently copy r8g8 images to/from other cpp=2 images,
       * due to the different tile layout.
       */
      use_staging_blit = true;
   } else if (is_swapped_format(src_format) ||
              is_swapped_format(dst_format)) {
      /* If either format has a non-identity swap, then we can't copy
       * to/from it.
       */
      use_staging_blit = true;
   } else if (!src_image->layout[0].ubwc) {
      format = dst_format;
   } else if (!dst_image->layout[0].ubwc) {
      format = src_format;
   } else {
      /* Both formats use UBWC and so neither can be reinterpreted.
       * TODO: We could do an in-place decompression of the dst instead.
       */
      use_staging_blit = true;
   }

   struct tu_image_view dst, src;

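   /* The staging path copies src -> linear staging image in src_format,
    * flushes, then reinterprets the staging image in dst_format and copies
    * staging -> dst. Because the staging image is linear and non-UBWC, it
    * can be safely reinterpreted in both formats.
    */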
   if (use_staging_blit) {
      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);

      struct tu_image staging_image = {
         .base.type = VK_OBJECT_TYPE_IMAGE,
         .vk_format = src_format,
         .level_count = 1,
         .layer_count = info->srcSubresource.layerCount,
         .bo_offset = 0,
      };

      VkImageSubresourceLayers staging_subresource = {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = info->srcSubresource.layerCount,
      };

      VkOffset3D staging_offset = { 0 };

      staging_image.layout[0].tile_mode = TILE6_LINEAR;
      staging_image.layout[0].ubwc = false;

      fdl6_layout(&staging_image.layout[0],
                  vk_format_to_pipe_format(staging_image.vk_format),
                  src_image->layout[0].nr_samples,
                  extent.width,
                  extent.height,
                  extent.depth,
                  staging_image.level_count,
                  staging_image.layer_count,
                  extent.depth > 1,
                  NULL);

      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_image.layout[0].size,
                                          &staging_image.bo);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      struct tu_image_view staging;
      tu_image_view_copy(&staging, &staging_image, src_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
                 dst_image->layout[0].nr_samples);
      coords(ops, cs, &staging_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &staging, i);
         ops->run(cmd, cs);
      }

      /* If this copy were done by the app, a pipeline barrier would be
       * required between the two passes; since we emit both internally,
       * flush the caches manually.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
      tu_cs_emit_wfi(cs);

      tu_image_view_copy(&staging, &staging_image, dst_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
                 0, false, dst_image->layout[0].ubwc,
                 dst_image->layout[0].nr_samples);
      coords(ops, cs, &dst_offset, &staging_offset, &extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   } else {
      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);

      ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
                 0, false, dst_image->layout[0].ubwc,
                 dst_image->layout[0].nr_samples);
      coords(ops, cs, &dst_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage destImage,
                VkImageLayout destImageLayout,
                uint32_t regionCount,
                const VkImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, destImage);

   for (uint32_t i = 0; i < regionCount; ++i) {
      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageCopy info = pRegions[i];
         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
            info.srcSubresource.aspectMask = BIT(b);
            info.dstSubresource.aspectMask = BIT(b);
            tu_copy_image_to_image(cmd, src_image, dst_image, &info);
         }
         continue;
      }

      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
   }
}

static void
copy_buffer(struct tu_cmd_buffer *cmd,
            uint64_t dst_va,
            uint64_t src_va,
            uint64_t size,
            uint32_t block_size)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
   uint64_t blocks = size / block_size;

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
              VK_SAMPLE_COUNT_1_BIT);

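   /* The copy is a series of 1-row "image" blits, at most 0x4000 (16384)
    * texels per blit. 64-byte address alignment is handled like in the
    * buffer<->image paths: round the address down and offset the rect in
    * whole blocks, e.g. (hypothetical) a src_va ending in 0x20 with
    * block_size = 1 gives src_x = 32.
    */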
   while (blocks) {
      uint32_t src_x = (src_va & 63) / block_size;
      uint32_t dst_x = (dst_va & 63) / block_size;
      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);

      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
      ops->dst_buffer(     cs, format, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      src_va += width * block_size;
      dst_va += width * block_size;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer srcBuffer,
                 VkBuffer dstBuffer,
                 uint32_t regionCount,
                 const VkBufferCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   for (unsigned i = 0; i < regionCount; ++i) {
      copy_buffer(cmd,
                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
                  pRegions[i].size, 1);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize dataSize,
                   const void *pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);

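   /* Stage the data in GPU-visible command-stream memory: the allocation is
    * DIV_ROUND_UP(dataSize, 64) chunks of 16 dwords, i.e. dataSize rounded
    * up to a multiple of 64 bytes, which copy_buffer() then copies into the
    * destination buffer in 4-byte blocks.
    */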
   struct tu_cs_memory tmp;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(tmp.map, pData, dataSize);
   copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer dstBuffer,
                 VkDeviceSize dstOffset,
                 VkDeviceSize fillSize,
                 uint32_t data)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (fillSize == VK_WHOLE_SIZE)
      fillSize = buffer->size - dstOffset;

   uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
   uint32_t blocks = fillSize / 4;

   ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});

   while (blocks) {
      uint32_t dst_x = (dst_va & 63) / 4;
      uint32_t width = MIN2(blocks, 0x4000 - dst_x);

      ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      dst_va += width * 4;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdResolveImage(VkCommandBuffer commandBuffer,
                   VkImage srcImage,
                   VkImageLayout srcImageLayout,
                   VkImage dstImage,
                   VkImageLayout dstImageLayout,
                   uint32_t regionCount,
                   const VkImageResolve *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
              0, false, dst_image->layout[0].ubwc, VK_SAMPLE_COUNT_1_BIT);

   for (uint32_t i = 0; i < regionCount; ++i) {
      const VkImageResolve *info = &pRegions[i];
      uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);

      assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
      /* TODO: aspect masks possible? */

      coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);

      struct tu_image_view dst, src;
      tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);

      for (uint32_t layer = 0; layer < layers; layer++) {
         ops->src(cmd, cs, &src, layer, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, layer);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

#define for_each_layer(layer, layer_mask, layers) \
   for (uint32_t layer = 0; \
        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
        layer++) \
      if (!layer_mask || (layer_mask & BIT(layer)))
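
/* E.g. with layer_mask = 0x5 the loop body runs for layers 0 and 2; with a
 * zero mask it runs for every layer in [0, layers).
 */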

static void
resolve_sysmem(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat format,
               const struct tu_image_view *src,
               const struct tu_image_view *dst,
               uint32_t layer_mask,
               uint32_t layers,
               const VkRect2D *rect,
               bool separate_stencil)
{
   const struct blit_ops *ops = &r2d_ops;

   trace_start_sysmem_resolve(&cmd->trace, cs);

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,
              0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT);
   ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);

   for_each_layer(i, layer_mask, layers) {
      if (separate_stencil) {
         r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
         r2d_dst_stencil(cs, dst, i);
      } else {
         ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
         ops->dst(cs, dst, i);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_resolve(&cmd->trace, cs, format);
}

void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *src,
                  const struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect)
{
   assert(src->image->vk_format == dst->image->vk_format);

   if (dst->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT,
                     src, dst, layer_mask, layers, rect, false);
      resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT,
                     src, dst, layer_mask, layers, rect, true);
   } else {
      resolve_sysmem(cmd, cs, dst->image->vk_format,
                     src, dst, layer_mask, layers, rect, false);
   }
}

static void
clear_image(struct tu_cmd_buffer *cmd,
            struct tu_image *image,
            const VkClearValue *clear_value,
            const VkImageSubresourceRange *range,
            VkImageAspectFlags aspect_mask)
{
   uint32_t level_count = tu_get_levelCount(image, range);
   uint32_t layer_count = tu_get_layerCount(image, range);
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = image->vk_format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      format = copy_format(format, aspect_mask, false);

   if (image->layout[0].depth0 > 1) {
      assert(layer_count == 1);
      assert(range->baseArrayLayer == 0);
   }

   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;

   ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc,
              image->layout[0].nr_samples);
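   /* E5B9G9R9 is remapped by copy_format() above (the blit engine can't
    * render to it directly), but the clear value still has to be packed in
    * the original shared-exponent layout:
    */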
   if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
   else
      ops->clear_value(cs, format, clear_value);

   for (unsigned j = 0; j < level_count; j++) {
      if (image->layout[0].depth0 > 1)
         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);

      ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
                  });

      struct tu_image_view dst;
      tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
         .aspectMask = aspect_mask,
         .mipLevel = range->baseMipLevel + j,
         .baseArrayLayer = range->baseArrayLayer,
         .layerCount = 1,
      }, 0, false, false);

      for (uint32_t i = 0; i < layer_count; i++) {
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
                      VkImage image_h,
                      VkImageLayout imageLayout,
                      const VkClearColorValue *pColor,
                      uint32_t rangeCount,
                      const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++)
      clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
                             VkImage image_h,
                             VkImageLayout imageLayout,
                             const VkClearDepthStencilValue *pDepthStencil,
                             uint32_t rangeCount,
                             const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++) {
      const VkImageSubresourceRange *range = &pRanges[i];

      if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         /* can't clear both depth and stencil at once, split up the aspect mask */
         u_foreach_bit(b, range->aspectMask)
            clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
         continue;
      }

      clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
   }
}

static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
                            uint32_t attachment_count,
                            const VkClearAttachment *attachments,
                            uint32_t rect_count,
                            const VkClearRect *rects)
{
   /* the shader path here is special, it avoids changing MRT/etc state */
   const struct tu_subpass *subpass = cmd->state.subpass;
   const uint32_t mrt_count = subpass->color_count;
   struct tu_cs *cs = &cmd->draw_cs;
   uint32_t clear_value[MAX_RTS][4];
   float z_clear_val = 0.0f;
   uint8_t s_clear_val = 0;
   uint32_t clear_rts = 0, clear_components = 0;
   bool z_clear = false;
   bool s_clear = false;

   trace_start_sysmem_clear_all(&cmd->trace, cs);

   for (uint32_t i = 0; i < attachment_count; i++) {
      uint32_t a;
      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         uint32_t c = attachments[i].colorAttachment;
         a = subpass->color_attachments[c].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         clear_rts |= 1 << c;
         clear_components |= 0xf << (c * 4);
         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
      } else {
         a = subpass->depth_stencil_attachment.attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            z_clear = true;
            z_clear_val = attachments[i].clearValue.depthStencil.depth;
         }

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            s_clear = true;
            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
         }
      }
   }

   /* We may not know the multisample count if there are no attachments, so
    * just bail early to avoid corner cases later.
    */
   if (clear_rts == 0 && !z_clear && !s_clear)
      return;

   /* Disable all draw states so they don't interfere.
    * TODO: use and re-use draw states.
    * Note that the draw states are disabled individually, to preserve the
    * input attachment states: a secondary command buffer won't be able to
    * restore them.
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
   for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
      if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
          i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
         continue;
      tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
                     CP_SET_DRAW_STATE__0_DISABLE);
      tu_cs_emit_qw(cs, 0);
   }
   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;

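   /* Point the FS depth and sample-mask outputs at the invalid register
    * (0xfc): the clear shader only writes color.
    */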
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));

   r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);

   tu_cs_emit_regs(cs,
                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
   tu_cs_emit_regs(cs,
                   A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
   for (uint32_t i = 0; i < mrt_count; i++) {
      tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
            .component_enable = COND(clear_rts & (1 << i), 0xf)));
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
         .z_test_enable = z_clear,
         .z_write_enable = z_clear,
         .zfunc = FUNC_ALWAYS));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
         .stencil_enable = s_clear,
         .func = FUNC_ALWAYS,
         .zpass = STENCIL_REPLACE));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));

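   /* Upload the clear colors as FS constants, one vec4 per enabled RT, in
    * ascending RT order (matching the u_foreach_bit loop below):
    */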
   unsigned num_rts = util_bitcount(clear_rts);
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   u_foreach_bit(b, clear_rts)
      tu_cs_emit_array(cs, clear_value[b], 4);

   for (uint32_t i = 0; i < rect_count; i++) {
      /* This should be true because of this valid usage for
       * vkCmdClearAttachments:
       *
       *    "If the render pass instance this is recorded in uses multiview,
       *    then baseArrayLayer must be zero and layerCount must be one"
       */
      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);

      /* a630 doesn't support multiview masks, which means that we can't use
       * the normal multiview path without potentially recompiling a shader
       * on-demand or using a more complicated variant that takes the mask as
       * a const. Just use the layered path instead, since it shouldn't be
       * much worse.
       */
      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
         r3d_coords_raw(cs, (float[]) {
            rects[i].rect.offset.x, rects[i].rect.offset.y,
            z_clear_val, uif(rects[i].baseArrayLayer + layer),
            rects[i].rect.offset.x + rects[i].rect.extent.width,
            rects[i].rect.offset.y + rects[i].rect.extent.height,
            z_clear_val, 1.0f,
         });
         r3d_run(cmd, cs);
      }
   }

   trace_end_sysmem_clear_all(&cmd->trace,
                              cs, mrt_count, rect_count);
}

static void
pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
{
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                       val->depthStencil.stencil << 24;
      return;
   case VK_FORMAT_D16_UNORM:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
      return;
   case VK_FORMAT_D32_SFLOAT:
      clear_value[0] = fui(val->depthStencil.depth);
      return;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      return;
   default:
      break;
   }

   float tmp[4];
   memcpy(tmp, val->color.float32, 4 * sizeof(float));
   if (vk_format_is_srgb(format)) {
      for (int i = 0; i < 3; i++)
         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
   }

#define PACK_F(type) util_format_##type##_pack_rgba_float \
   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4:
      PACK_F(r4g4b4a4_unorm);
      break;
   case 5:
      if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
         PACK_F(r5g6b5_unorm);
      else
         PACK_F(r5g5b5a1_unorm);
      break;
   case 8:
      if (vk_format_is_snorm(format))
         PACK_F(r8g8b8a8_snorm);
      else if (vk_format_is_unorm(format))
         PACK_F(r8g8b8a8_unorm);
      else
         pack_int8(clear_value, val->color.uint32);
      break;
   case 10:
      if (vk_format_is_int(format))
         pack_int10_2(clear_value, val->color.uint32);
      else
         PACK_F(r10g10b10a2_unorm);
      break;
   case 11:
      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
      break;
   case 16:
      if (vk_format_is_snorm(format))
         PACK_F(r16g16b16a16_snorm);
      else if (vk_format_is_unorm(format))
         PACK_F(r16g16b16a16_unorm);
      else if (vk_format_is_float(format))
         PACK_F(r16g16b16a16_float);
      else
         pack_int16(clear_value, val->color.uint32);
      break;
   case 32:
      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
      break;
   default:
      unreachable("unexpected channel size");
   }
#undef PACK_F
}

static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      VkFormat format,
                      uint8_t clear_mask,
                      uint32_t gmem_offset,
                      const VkClearValue *value)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
   tu_cs_emit(cs, gmem_offset);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
   tu_cs_emit(cs, 0);

   uint32_t clear_vals[4] = {};
   pack_gmem_clear_value(value, format, clear_vals);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
   tu_cs_emit_array(cs, clear_vals, 4);

   tu6_emit_event_write(cmd, cs, BLIT);
}

static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              uint32_t attachment,
                              VkImageAspectFlags mask,
                              const VkClearValue *value)
{
   const struct tu_render_pass_attachment *att =
      &cmd->state.pass->attachments[attachment];

   trace_start_gmem_clear(&cmd->trace, cs);

   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
         clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
      if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
         clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
   } else {
      clear_gmem_attachment(cmd, cs, att->format,
                            aspect_write_mask(att->format, mask),
                            att->gmem_offset, value);
   }

   trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
}

static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                          uint32_t attachment_count,
                          const VkClearAttachment *attachments,
                          uint32_t rect_count,
                          const VkClearRect *rects)
{
   const struct tu_subpass *subpass = cmd->state.subpass;
   struct tu_cs *cs = &cmd->draw_cs;

   /* TODO: swap the loops for smaller cmdstream */
   for (unsigned i = 0; i < rect_count; i++) {
      unsigned x1 = rects[i].rect.offset.x;
      unsigned y1 = rects[i].rect.offset.y;
      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
      unsigned y2 = y1 + rects[i].rect.extent.height - 1;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));

      for (unsigned j = 0; j < attachment_count; j++) {
         uint32_t a;
         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
         else
            a = subpass->depth_stencil_attachment.attachment;

         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                       &attachments[j].clearValue);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
                       uint32_t attachmentCount,
                       const VkClearAttachment *pAttachments,
                       uint32_t rectCount,
                       const VkClearRect *pRects)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* The sysmem path behaves like a draw. Note that we don't have a way of
    * using different flushes for sysmem/gmem, so this needs to be outside of
    * the cond_exec.
    */
2520    tu_emit_cache_flush_renderpass(cmd, cs);
2521 
2522    for (uint32_t j = 0; j < attachmentCount; j++) {
2523       if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
2524          continue;
2525       cmd->state.lrz.valid = false;
2526       cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2527    }
2528 
2529    /* vkCmdClearAttachments is supposed to respect the predicate if active.
2530     * The easiest way to do this is to always use the 3d path, which always
2531     * works even with GMEM because it's just a simple draw using the existing
2532     * attachment state. However it seems that IGNORE_VISIBILITY draws must be
2533     * skipped in the binning pass, since otherwise they produce binning data
2534     * which isn't consumed and leads to the wrong binning data being read, so
2535     * condition on GMEM | SYSMEM.
2536     */
2537    if (cmd->state.predication_active) {
2538       tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
2539                              CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2540       tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2541       tu_cond_exec_end(cs);
2542       return;
2543    }
2544 
2545    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2546    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2547    tu_cond_exec_end(cs);
2548 
2549    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2550    tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2551    tu_cond_exec_end(cs);
2552 }
2553 
2554 static void
clear_sysmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat format,VkImageAspectFlags clear_mask,const VkRenderPassBeginInfo * info,uint32_t a,bool separate_stencil)2555 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2556                         struct tu_cs *cs,
2557                         VkFormat format,
2558                         VkImageAspectFlags clear_mask,
2559                         const VkRenderPassBeginInfo *info,
2560                         uint32_t a,
2561                         bool separate_stencil)
2562 {
2563    const struct tu_framebuffer *fb = cmd->state.framebuffer;
2564    const struct tu_image_view *iview = cmd->state.attachments[a];
2565    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
2566    const struct blit_ops *ops = &r2d_ops;
2567    if (cmd->state.pass->attachments[a].samples > 1)
2568       ops = &r3d_ops;
2569 
2570    trace_start_sysmem_clear(&cmd->trace, cs);
2571 
2572    ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled,
2573               cmd->state.pass->attachments[a].samples);
2574    ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2575    ops->clear_value(cs, format, &info->pClearValues[a]);
2576 
2577    for_each_layer(i, clear_views, fb->layers) {
2578       if (separate_stencil) {
2579          if (ops == &r3d_ops)
2580             r3d_dst_stencil(cs, iview, i);
2581          else
2582             r2d_dst_stencil(cs, iview, i);
2583       } else {
2584          ops->dst(cs, iview, i);
2585       }
2586       ops->run(cmd, cs);
2587    }
2588 
2589    ops->teardown(cmd, cs);
2590 
2591    trace_end_sysmem_clear(&cmd->trace, cs,
2592                           format, ops == &r3d_ops,
2593                           cmd->state.pass->attachments[a].samples);
2594 }
2595 
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkRenderPassBeginInfo *info)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 info, a, false);
      }
      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 info, a, true);
      }
   } else {
      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
                              info, a, false);
   }

   /* The spec doesn't explicitly say, but presumably the initial renderpass
    * clear is considered part of the renderpass, and therefore barriers
    * aren't required inside the subpass/renderpass.  Therefore we need to
    * flush CCU color into CCU depth here, just like with
    * vkCmdClearAttachments(). Note that because this only happens at the
    * beginning of a renderpass, and renderpass writes are considered
    * "incoherent", we shouldn't have to worry about syncing depth into color
    * beforehand as depth should already be flushed.
    */
   if (vk_format_is_depth_or_stencil(attachment->format)) {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
   } else {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
   }

   if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
      tu_cs_emit_wfi(cs);
}

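/* Clear an attachment's GMEM contents at the start of a tile, according to
 * the attachment's clear_mask.
 */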
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkRenderPassBeginInfo *info)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
                                 &info->pClearValues[a]);
}

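/* Emit a CP_EVENT_WRITE::BLIT for one attachment. The same fixed-function
 * event handles both directions: with .gmem set it loads the attachment
 * from sysmem into GMEM, and with it cleared it resolves/stores GMEM back
 * to sysmem (going by how the bits are programmed below; the .unk0 bit is
 * not well understood).
 */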
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   /* Integer and depth/stencil formats must be resolved by taking sample 0
    * rather than averaging the samples.
    */
   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      .sample_0 = vk_format_is_int(attachment->format) |
         vk_format_is_depth_or_stencil(attachment->format)));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
   if (separate_stencil) {
      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
      tu_cs_emit_qw(cs, iview->stencil_base_addr);
      tu_cs_emit(cs, iview->stencil_PITCH);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
   } else {
      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
      tu_cs_image_ref_2d(cs, iview, 0, false);

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
      tu_cs_image_flag_ref(cs, iview, 0);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
   }

   tu6_emit_event_write(cmd, cs, BLIT);
}

static bool
blit_can_resolve(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   /* The blit event can only resolve simple cases: averaging samples as
    * unsigned integers, or taking a single sample.
    */
   if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
      return false;

   /* It can't handle formats with channels larger than 10 bits.
    * Note: this includes all float formats.
    * Note2: single-channel integer formats seem OK.
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* For unknown reasons the blit event can't MSAA-resolve these formats
    * when tiled, likely because they have a different layout from the
    * other cpp=2 formats.
    */
   case VK_FORMAT_R8G8_UNORM:
   case VK_FORMAT_R8G8_UINT:
   case VK_FORMAT_R8G8_SINT:
   /* TODO: this one should be able to work? */
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

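/* Load an attachment's contents from sysmem into GMEM at the start of a
 * tile. force_load makes the load happen even when the renderpass itself
 * doesn't require it; deciding when that is needed is up to the caller.
 */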
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool force_load)
{
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   trace_start_gmem_load(&cmd->trace, cs);

   if (attachment->load || force_load)
      tu_emit_blit(cmd, cs, iview, attachment, false, false);

   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
      tu_emit_blit(cmd, cs, iview, attachment, false, true);

   trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
}

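/* Store GMEM contents to sysmem using the 2D engine (CP_BLIT), treating
 * GMEM as a tightly-packed linear source image: the source pitch is the
 * tile width times the GMEM cpp. For example (hypothetical numbers), a
 * 256-pixel-wide tile of a 4-byte-per-pixel format gives a pitch of
 * 256 * 4 = 1024 bytes.
 */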
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              VkFormat format,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
                    iview->ubwc_enabled, true);
   if (separate_stencil)
      r2d_dst_stencil(cs, iview, 0);
   else
      r2d_dst(cs, iview, 0);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
                      .tile_mode = TILE6_2,
                      .srgb = vk_format_is_srgb(format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !vk_format_is_int(format) &&
                                         !vk_format_is_depth_or_stencil(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   /* note: src size does not matter when not scaling */
                   A6XX_SP_PS_2D_SRC_SIZE(.width = 0x3fff, .height = 0x3fff),
                   A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));

   /* Sync GMEM writes with CACHE so the 2D engine reads the latest data. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

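/* Store GMEM contents to sysmem with a draw instead of the 2D engine.
 * Judging by how this path is selected in tu_store_gmem_attachment() below,
 * it is used when the destination is multisampled, which the 2D engine
 * can't write. Depth destinations are written as a color RT.
 */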
static void
store_3d_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t dst_samples,
              bool separate_stencil,
              VkFormat format,
              const VkRect2D *render_area,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
             iview->ubwc_enabled, dst_samples);

   r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

   if (separate_stencil)
      r3d_dst_stencil(cs, iview, 0);
   else
      r3d_dst(cs, iview, 0);

   r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp);

   /* Sync GMEM writes with CACHE so the draw reads the latest data. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   r3d_run(cmd, cs);

   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here. The 3d blit path
    * writes to depth images as a color RT, so there's no need to flush depth.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

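/* Store (or resolve, when a != gmem_a) a GMEM attachment to sysmem at the
 * end of a tile. The fixed-function blit event is used when the render
 * area is aligned and the resolve is one it supports (see
 * blit_can_resolve()); otherwise we fall back to CP_BLIT or a draw.
 */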
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   if (!dst->store && !dst->store_stencil)
      return;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width;
   uint32_t y2 = y1 + render_area->extent.height;
   /* x2/y2 can be unaligned if they are equal to the size of the image,
    * since the blit will then write into padding space. The one exception
    * is linear levels, which don't have the required y padding in the
    * layout (except for the last level).
    */
   bool need_y2_align =
      y2 != iview->extent.height || iview->need_y2_align;

   bool unaligned =
      x1 % phys_dev->info->gmem_align_w ||
      (x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) ||
      y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);

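   /* For example (hypothetical alignment values): with gmem_align_w = 64,
    * a render area starting at x1 = 32 is unaligned here and forces the
    * slow path at the end of this function.
    */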
   /* D32_SFLOAT_S8_UINT is a rather special format: it has two planes,
    * one for depth and the other for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT to S8_UINT we need to take that into account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   trace_start_gmem_store(&cmd->trace, cs);

   /* Use the fast path when the render area is aligned, except for
    * unsupported resolve cases.
    */
   if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
      if (dst->store)
         tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8);
      if (dst->store_stencil)
         tu_emit_blit(cmd, cs, iview, src, true, true);

      trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
      return;
   }

   VkFormat format = src->format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
      format = VK_FORMAT_D32_SFLOAT;

   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (dst->store || dst->store_stencil)
         tu_disable_draw_states(cmd, cs);

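      /* The stencil plane of a separate-stencil attachment is one byte per
       * sample, so its GMEM cpp equals the sample count; presumably that is
       * why src->samples is passed as the cpp argument for the stencil
       * stores below and in the 2D path.
       */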
      if (dst->store) {
         store_3d_blit(cmd, cs, iview, dst->samples, resolve_d32s8_s8, format,
                       render_area, src->gmem_offset, src->cpp);
      }
      if (dst->store_stencil) {
         store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT,
                       render_area, src->gmem_offset_stencil, src->samples);
      }
   } else {
      r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

      if (dst->store) {
         store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format,
                       src->gmem_offset, src->cpp);
      }
      if (dst->store_stencil) {
         store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
                       src->gmem_offset_stencil, src->samples);
      }
   }

   trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
}