1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "ir3/ir3_nir.h"
15
16 #include "util/format_r11g11b10f.h"
17 #include "util/format_rgb9e5.h"
18 #include "util/format_srgb.h"
19 #include "util/half_float.h"
20 #include "compiler/nir/nir_builder.h"
21
22 #include "tu_tracepoints.h"
23
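/* Pack a [0,1] float into an n-bit unorm value with round-to-nearest-even,
 * e.g. tu_pack_float32_for_unorm(0.5f, 8) -> lroundeven(0.5 * 255) = 128.
 */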
24 static uint32_t
25 tu_pack_float32_for_unorm(float val, int bits)
26 {
27 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
28 }
29
30 /* r2d_ = BLIT_OP_SCALE operations */
31
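/* Pick the intermediate format the 2D engine uses for a given Vulkan format.
 * Depth/stencil formats are special-cased; everything else is keyed off the
 * bit width of the red channel (e.g. both R5G6B5 and R8G8B8A8 map to
 * R2D_UNORM8).
 */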
32 static enum a6xx_2d_ifmt
33 format_to_ifmt(VkFormat format)
34 {
35 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
36 format == VK_FORMAT_X8_D24_UNORM_PACK32)
37 return R2D_UNORM8;
38
39 /* get_component_bits doesn't work with depth/stencil formats: */
40 if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
41 return R2D_FLOAT32;
42 if (format == VK_FORMAT_S8_UINT)
43 return R2D_INT8;
44
45 /* use the size of the red channel to find the corresponding "ifmt" */
46 bool is_int = vk_format_is_int(format);
47 switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
48 case 4: case 5: case 8:
49 return is_int ? R2D_INT8 : R2D_UNORM8;
50 case 10: case 11:
51 return is_int ? R2D_INT16 : R2D_FLOAT16;
52 case 16:
53 if (vk_format_is_float(format))
54 return R2D_FLOAT16;
55 return is_int ? R2D_INT16 : R2D_FLOAT32;
56 case 32:
57 return is_int ? R2D_INT32 : R2D_FLOAT32;
58 default:
59 unreachable("bad format");
60 return 0;
61 }
62 }
63
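/* Program the 2D blit destination/source rectangles. The bottom-right
 * coordinates are inclusive, hence the "- 1" on the extents.
 */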
64 static void
65 r2d_coords(struct tu_cs *cs,
66 const VkOffset2D *dst,
67 const VkOffset2D *src,
68 const VkExtent2D *extent)
69 {
70 tu_cs_emit_regs(cs,
71 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
72 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
73
74 if (!src)
75 return;
76
77 tu_cs_emit_regs(cs,
78 A6XX_GRAS_2D_SRC_TL_X(src->x),
79 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
80 A6XX_GRAS_2D_SRC_TL_Y(src->y),
81 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
82 }
83
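/* Pack the clear value into RB_2D_SRC_SOLID_C0..C3 according to the
 * intermediate format chosen by format_to_ifmt(). Depth/stencil and
 * E5B9G9R9 formats need special handling; plain color formats are packed
 * per channel.
 */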
84 static void
85 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
86 {
87 uint32_t clear_value[4] = {};
88
89 switch (format) {
90 case VK_FORMAT_X8_D24_UNORM_PACK32:
91 case VK_FORMAT_D24_UNORM_S8_UINT:
92 /* cleared as r8g8b8a8_unorm using special format */
93 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
94 clear_value[1] = clear_value[0] >> 8;
95 clear_value[2] = clear_value[0] >> 16;
96 clear_value[3] = val->depthStencil.stencil;
97 break;
98 case VK_FORMAT_D16_UNORM:
99 case VK_FORMAT_D32_SFLOAT:
100 /* R2D_FLOAT32 */
101 clear_value[0] = fui(val->depthStencil.depth);
102 break;
103 case VK_FORMAT_S8_UINT:
104 clear_value[0] = val->depthStencil.stencil;
105 break;
106 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
107 /* cleared as UINT32 */
108 clear_value[0] = float3_to_rgb9e5(val->color.float32);
109 break;
110 default:
111 assert(!vk_format_is_depth_or_stencil(format));
112 const struct util_format_description *desc = vk_format_description(format);
113 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
114
115 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
116 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
117
118 for (unsigned i = 0; i < desc->nr_channels; i++) {
119 const struct util_format_channel_description *ch = &desc->channel[i];
120 if (ifmt == R2D_UNORM8) {
121 float linear = val->color.float32[i];
122 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
123 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
124
125 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
126 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
127 else
128 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
129 } else if (ifmt == R2D_FLOAT16) {
130 clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
131 } else {
132 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
133 ifmt == R2D_INT16 || ifmt == R2D_INT8);
134 clear_value[i] = val->color.uint32[i];
135 }
136 }
137 break;
138 }
139
140 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
141 tu_cs_emit_array(cs, clear_value, 4);
142 }
143
144 static void
145 r2d_src(struct tu_cmd_buffer *cmd,
146 struct tu_cs *cs,
147 const struct tu_image_view *iview,
148 uint32_t layer,
149 VkFilter filter)
150 {
151 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
152 if (filter != VK_FILTER_NEAREST)
153 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
154
155 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
156 tu_cs_emit(cs, src_info);
157 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
158 tu_cs_image_ref_2d(cs, iview, layer, true);
159
160 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
161 tu_cs_image_flag_ref(cs, iview, layer);
162 }
163
164 static void
165 r2d_src_stencil(struct tu_cmd_buffer *cmd,
166 struct tu_cs *cs,
167 const struct tu_image_view *iview,
168 uint32_t layer,
169 VkFilter filter)
170 {
171 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
172 tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
173 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
174 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
175 /* SP_PS_2D_SRC_PITCH has shifted pitch field */
176 tu_cs_emit(cs, iview->stencil_PITCH << 9);
177 }
178
179 static void
180 r2d_src_buffer(struct tu_cmd_buffer *cmd,
181 struct tu_cs *cs,
182 VkFormat vk_format,
183 uint64_t va, uint32_t pitch,
184 uint32_t width, uint32_t height)
185 {
186 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
187
188 tu_cs_emit_regs(cs,
189 A6XX_SP_PS_2D_SRC_INFO(
190 .color_format = format.fmt,
191 .color_swap = format.swap,
192 .srgb = vk_format_is_srgb(vk_format),
193 .unk20 = 1,
194 .unk22 = 1),
195 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
196 A6XX_SP_PS_2D_SRC(.qword = va),
197 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
198 }
199
200 static void
201 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
202 {
203 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
204 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
205 tu_cs_image_ref_2d(cs, iview, layer, false);
206
207 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
208 tu_cs_image_flag_ref(cs, iview, layer);
209 }
210
211 static void
212 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
213 {
214 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
215 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
216 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
217 tu_cs_emit(cs, iview->stencil_PITCH);
218 }
219
220 static void
221 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
222 {
223 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
224
225 tu_cs_emit_regs(cs,
226 A6XX_RB_2D_DST_INFO(
227 .color_format = format.fmt,
228 .color_swap = format.swap,
229 .srgb = vk_format_is_srgb(vk_format)),
230 A6XX_RB_2D_DST(.qword = va),
231 A6XX_RB_2D_DST_PITCH(pitch));
232 }
233
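/* Common setup for the 2D (BLIT_OP_SCALE) path. blit_param is the rotation
 * used by CmdBlitImage (see struct blit_ops). D24S8 with UBWC is accessed as
 * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8, and RB_2D_UNKNOWN_8C01 preserves the
 * untouched aspect when only depth or only stencil is written.
 */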
234 static void
235 r2d_setup_common(struct tu_cmd_buffer *cmd,
236 struct tu_cs *cs,
237 VkFormat vk_format,
238 VkImageAspectFlags aspect_mask,
239 unsigned blit_param,
240 bool clear,
241 bool ubwc,
242 bool scissor)
243 {
244 enum a6xx_format format = tu6_base_format(vk_format);
245 enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
246 uint32_t unknown_8c01 = 0;
247
248 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
249 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
250 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
251 }
252
253 /* note: the only format with partial clearing is D24S8 */
254 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
255 /* preserve stencil channel */
256 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
257 unknown_8c01 = 0x08000041;
258 /* preserve depth channels */
259 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
260 unknown_8c01 = 0x00084001;
261 }
262
263 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
264 tu_cs_emit(cs, unknown_8c01);
265
266 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
267 .scissor = scissor,
268 .rotate = blit_param,
269 .solid_color = clear,
270 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
271 .color_format = format,
272 .mask = 0xf,
273 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
274 ).value;
275
276 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
277 tu_cs_emit(cs, blit_cntl);
278
279 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
280 tu_cs_emit(cs, blit_cntl);
281
282 if (format == FMT6_10_10_10_2_UNORM_DEST)
283 format = FMT6_16_16_16_16_FLOAT;
284
285 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
286 .sint = vk_format_is_sint(vk_format),
287 .uint = vk_format_is_uint(vk_format),
288 .color_format = format,
289 .srgb = vk_format_is_srgb(vk_format),
290 .mask = 0xf));
291 }
292
293 static void
294 r2d_setup(struct tu_cmd_buffer *cmd,
295 struct tu_cs *cs,
296 VkFormat vk_format,
297 VkImageAspectFlags aspect_mask,
298 unsigned blit_param,
299 bool clear,
300 bool ubwc,
301 VkSampleCountFlagBits samples)
302 {
303 assert(samples == VK_SAMPLE_COUNT_1_BIT);
304
305 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
306
307 r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false);
308 }
309
310 static void
311 r2d_teardown(struct tu_cmd_buffer *cmd,
312 struct tu_cs *cs)
313 {
314 /* nothing to do here */
315 }
316
317 static void
318 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
319 {
320 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
321 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
322 }
323
324 /* r3d_ = shader path operations */
325
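/* The 3D path draws a two-vertex RECTLIST using small internal VS/FS
 * programs, built below with nir_builder and compiled once at device
 * creation. Blit coordinates and clear values are passed to those shaders
 * as constants.
 */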
326 static nir_ssa_def *
327 load_const(nir_builder *b, unsigned base, unsigned components)
328 {
329 return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
330 .base = base);
331 }
332
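/* Constant layout consumed by the blit VS (uploaded by r3d_coords_raw() and
 * r3d_coord_z()): c0.xy = dst0, c0.zw = src0, c1.xy = dst1, c1.zw = src1,
 * c2.x = z coordinate for the z-scale path.
 */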
333 static nir_shader *
334 build_blit_vs_shader(void)
335 {
336 nir_builder _b =
337 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
338 nir_builder *b = &_b;
339
340 nir_variable *out_pos =
341 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
342 "gl_Position");
343 out_pos->data.location = VARYING_SLOT_POS;
344
345 nir_ssa_def *vert0_pos = load_const(b, 0, 2);
346 nir_ssa_def *vert1_pos = load_const(b, 4, 2);
347 nir_ssa_def *vertex = nir_load_vertex_id(b);
348
349 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
350 pos = nir_vec4(b, nir_channel(b, pos, 0),
351 nir_channel(b, pos, 1),
352 nir_imm_float(b, 0.0),
353 nir_imm_float(b, 1.0));
354
355 nir_store_var(b, out_pos, pos, 0xf);
356
357 nir_variable *out_coords =
358 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
359 "coords");
360 out_coords->data.location = VARYING_SLOT_VAR0;
361
362 nir_ssa_def *vert0_coords = load_const(b, 2, 2);
363 nir_ssa_def *vert1_coords = load_const(b, 6, 2);
364
365 /* Only used with "z scale" blit path which uses a 3d texture */
366 nir_ssa_def *z_coord = load_const(b, 8, 1);
367
368 nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords);
369 coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
370 z_coord);
371
372 nir_store_var(b, out_coords, coords, 0x7);
373
374 return b->shader;
375 }
376
377 static nir_shader *
378 build_clear_vs_shader(void)
379 {
380 nir_builder _b =
381 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
382 nir_builder *b = &_b;
383
384 nir_variable *out_pos =
385 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
386 "gl_Position");
387 out_pos->data.location = VARYING_SLOT_POS;
388
389 nir_ssa_def *vert0_pos = load_const(b, 0, 2);
390 nir_ssa_def *vert1_pos = load_const(b, 4, 2);
391 /* c0.z is used to clear depth */
392 nir_ssa_def *depth = load_const(b, 2, 1);
393 nir_ssa_def *vertex = nir_load_vertex_id(b);
394
395 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
396 pos = nir_vec4(b, nir_channel(b, pos, 0),
397 nir_channel(b, pos, 1),
398 depth, nir_imm_float(b, 1.0));
399
400 nir_store_var(b, out_pos, pos, 0xf);
401
402 nir_variable *out_layer =
403 nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
404 "gl_Layer");
405 out_layer->data.location = VARYING_SLOT_LAYER;
406 nir_ssa_def *layer = load_const(b, 3, 1);
407 nir_store_var(b, out_layer, layer, 1);
408
409 return b->shader;
410 }
411
412 static nir_shader *
413 build_blit_fs_shader(bool zscale)
414 {
415 nir_builder _b =
416 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
417 zscale ? "zscale blit fs" : "blit fs");
418 nir_builder *b = &_b;
419
420 nir_variable *out_color =
421 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
422 "color0");
423 out_color->data.location = FRAG_RESULT_DATA0;
424
425 unsigned coord_components = zscale ? 3 : 2;
426 nir_variable *in_coords =
427 nir_variable_create(b->shader, nir_var_shader_in,
428 glsl_vec_type(coord_components),
429 "coords");
430 in_coords->data.location = VARYING_SLOT_VAR0;
431
432 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
433 /* Note: since we're just copying data, we rely on the HW ignoring the
434 * dest_type.
435 */
436 tex->dest_type = nir_type_int32;
437 tex->is_array = false;
438 tex->is_shadow = false;
439 tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;
440
441 tex->texture_index = 0;
442 tex->sampler_index = 0;
443
444 b->shader->info.num_textures = 1;
445 BITSET_SET(b->shader->info.textures_used, 0);
446
447 tex->src[0].src_type = nir_tex_src_coord;
448 tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords));
449 tex->coord_components = coord_components;
450
451 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
452 nir_builder_instr_insert(b, &tex->instr);
453
454 nir_store_var(b, out_color, &tex->dest.ssa, 0xf);
455
456 return b->shader;
457 }
458
459 /* We can only read multisample textures via txf_ms, so we need a separate
460 * variant for them.
461 */
462 static nir_shader *
463 build_ms_copy_fs_shader(void)
464 {
465 nir_builder _b =
466 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
467 "multisample copy fs");
468 nir_builder *b = &_b;
469
470 nir_variable *out_color =
471 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
472 "color0");
473 out_color->data.location = FRAG_RESULT_DATA0;
474
475 nir_variable *in_coords =
476 nir_variable_create(b->shader, nir_var_shader_in,
477 glsl_vec_type(2),
478 "coords");
479 in_coords->data.location = VARYING_SLOT_VAR0;
480
481 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
482
483 tex->op = nir_texop_txf_ms;
484
485 /* Note: since we're just copying data, we rely on the HW ignoring the
486 * dest_type.
487 */
488 tex->dest_type = nir_type_int32;
489 tex->is_array = false;
490 tex->is_shadow = false;
491 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
492
493 tex->texture_index = 0;
494 tex->sampler_index = 0;
495
496 b->shader->info.num_textures = 1;
497 BITSET_SET(b->shader->info.textures_used, 0);
498 BITSET_SET(b->shader->info.textures_used_by_txf, 0);
499
500 nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));
501
502 tex->src[0].src_type = nir_tex_src_coord;
503 tex->src[0].src = nir_src_for_ssa(coord);
504 tex->coord_components = 2;
505
506 tex->src[1].src_type = nir_tex_src_ms_index;
507 tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));
508
509 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
510 nir_builder_instr_insert(b, &tex->instr);
511
512 nir_store_var(b, out_color, &tex->dest.ssa, 0xf);
513
514 return b->shader;
515 }
516
517 static nir_shader *
518 build_clear_fs_shader(unsigned mrts)
519 {
520 nir_builder _b =
521 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
522 "mrt%u clear fs", mrts);
523 nir_builder *b = &_b;
524
525 for (unsigned i = 0; i < mrts; i++) {
526 nir_variable *out_color =
527 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
528 "color");
529 out_color->data.location = FRAG_RESULT_DATA0 + i;
530
531 nir_ssa_def *color = load_const(b, 4 * i, 4);
532 nir_store_var(b, out_color, color, 0xf);
533 }
534
535 return b->shader;
536 }
537
538 static void
539 compile_shader(struct tu_device *dev, struct nir_shader *nir,
540 unsigned consts, unsigned *offset, enum global_shader idx)
541 {
542 nir->options = ir3_get_compiler_options(dev->compiler);
543
544 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
545 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
546
547 ir3_finalize_nir(dev->compiler, nir);
548
549 struct ir3_shader *sh = ir3_shader_from_nir(dev->compiler, nir,
550 align(consts, 4), NULL);
551
552 struct ir3_shader_key key = {};
553 bool created;
554 struct ir3_shader_variant *so =
555 ir3_shader_get_variant(sh, &key, false, false, &created);
556
557 struct tu6_global *global = dev->global_bo.map;
558
559 assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
560 dev->global_shaders[idx] = so;
561 memcpy(&global->shaders[*offset], so->bin,
562 sizeof(uint32_t) * so->info.sizedwords);
563 dev->global_shader_va[idx] = dev->global_bo.iova +
564 gb_offset(shaders[*offset]);
565 *offset += align(so->info.sizedwords, 32);
566 }
567
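/* Build and compile all internal clear/blit shaders into the global BO at
 * device creation, so no shader compilation happens at record time.
 */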
568 void
569 tu_init_clear_blit_shaders(struct tu_device *dev)
570 {
571 unsigned offset = 0;
572 compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
573 compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
574 compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
575 compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
576 compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS);
577
578 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
579 compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
580 GLOBAL_SH_FS_CLEAR0 + num_rts);
581 }
582 }
583
584 void
585 tu_destroy_clear_blit_shaders(struct tu_device *dev)
586 {
587 for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
588 if (dev->global_shaders[i])
589 ir3_shader_destroy(dev->global_shaders[i]->shader);
590 }
591 }
592
593 static void
594 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
595 uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
596 {
597 enum global_shader vs_id =
598 blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR;
599
600 struct ir3_shader_variant *vs = cmd->device->global_shaders[vs_id];
601 uint64_t vs_iova = cmd->device->global_shader_va[vs_id];
602
603 enum global_shader fs_id = GLOBAL_SH_FS_BLIT;
604
605 if (z_scale)
606 fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
607 else if (samples != VK_SAMPLE_COUNT_1_BIT)
608 fs_id = GLOBAL_SH_FS_COPY_MS;
609
610 unsigned num_rts = util_bitcount(rts_mask);
611 if (!blit)
612 fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;
613
614 struct ir3_shader_variant *fs = cmd->device->global_shaders[fs_id];
615 uint64_t fs_iova = cmd->device->global_shader_va[fs_id];
616
617 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
618 .vs_state = true,
619 .hs_state = true,
620 .ds_state = true,
621 .gs_state = true,
622 .fs_state = true,
623 .cs_state = true,
624 .gfx_ibo = true,
625 .cs_ibo = true,
626 .gfx_shared_const = true,
627 .gfx_bindless = 0x1f,
628 .cs_bindless = 0x1f));
629
630 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs);
631 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
632 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
633 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
634 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs);
635
636 struct tu_pvtmem_config pvtmem = {};
637 tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
638 tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
639
640 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
641 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
642
643 if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {
644 /* Copy what the blob does here. This will emit an extra 0x3f
645 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
646 * this is working around yet.
647 */
648 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
649 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
650 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
651 tu_cs_emit(cs, 0);
652 } else {
653 tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());
654 }
655 tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());
656
657 tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0);
658
659 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
660 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
661 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
662
663 tu6_emit_fs_inputs(cs, fs);
664
665 tu_cs_emit_regs(cs,
666 A6XX_GRAS_CL_CNTL(
667 .persp_division_disable = 1,
668 .vp_xform_disable = 1,
669 .vp_clip_code_ignore = 1,
670 .clip_disable = 1));
671 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
672
673 tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
674 tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
675
676 tu_cs_emit_regs(cs,
677 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
678 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
679 tu_cs_emit_regs(cs,
680 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
681 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
682
683 tu_cs_emit_regs(cs,
684 A6XX_VFD_INDEX_OFFSET(),
685 A6XX_VFD_INSTANCE_START_OFFSET());
686
687 if (rts_mask) {
688 unsigned rts_count = util_last_bit(rts_mask);
689 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
690 unsigned rt = 0;
691 for (unsigned i = 0; i < rts_count; i++) {
692 unsigned regid = 0;
693 if (rts_mask & (1u << i))
694 regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
695 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid));
696 }
697 }
698
699 cmd->state.line_mode = RECTANGULAR;
700 tu6_emit_msaa(cs, samples, cmd->state.line_mode);
701 }
702
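/* Upload 8 floats (two vec4s) of blit coordinates to the VS constant file;
 * see the constant layout described above build_blit_vs_shader().
 */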
703 static void
704 r3d_coords_raw(struct tu_cs *cs, const float *coords)
705 {
706 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
707 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
708 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
709 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
710 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
711 CP_LOAD_STATE6_0_NUM_UNIT(2));
712 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
713 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
714 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
715 }
716
717 /* z coordinate for "z scale" blit path which uses a 3d texture */
718 static void
719 r3d_coord_z(struct tu_cs *cs, float z)
720 {
721 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4);
722 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |
723 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
724 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
725 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
726 CP_LOAD_STATE6_0_NUM_UNIT(1));
727 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
728 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
729 tu_cs_emit(cs, fui(z));
730 tu_cs_emit(cs, 0);
731 tu_cs_emit(cs, 0);
732 tu_cs_emit(cs, 0);
733 }
734
735 static void
736 r3d_coords(struct tu_cs *cs,
737 const VkOffset2D *dst,
738 const VkOffset2D *src,
739 const VkExtent2D *extent)
740 {
741 int32_t src_x1 = src ? src->x : 0;
742 int32_t src_y1 = src ? src->y : 0;
743 r3d_coords_raw(cs, (float[]) {
744 dst->x, dst->y,
745 src_x1, src_y1,
746 dst->x + extent->width, dst->y + extent->height,
747 src_x1 + extent->width, src_y1 + extent->height,
748 });
749 }
750
751 static void
752 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
753 {
754 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
755 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
756 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
757 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
758 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
759 CP_LOAD_STATE6_0_NUM_UNIT(1));
760 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
761 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
762 switch (format) {
763 case VK_FORMAT_X8_D24_UNORM_PACK32:
764 case VK_FORMAT_D24_UNORM_S8_UINT: {
765 /* cleared as r8g8b8a8_unorm using special format */
766 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
767 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
768 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
769 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
770 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
771 } break;
772 case VK_FORMAT_D16_UNORM:
773 case VK_FORMAT_D32_SFLOAT:
774 tu_cs_emit(cs, fui(val->depthStencil.depth));
775 tu_cs_emit(cs, 0);
776 tu_cs_emit(cs, 0);
777 tu_cs_emit(cs, 0);
778 break;
779 case VK_FORMAT_S8_UINT:
780 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
781 tu_cs_emit(cs, 0);
782 tu_cs_emit(cs, 0);
783 tu_cs_emit(cs, 0);
784 break;
785 default:
786 /* as color formats use clear value as-is */
787 assert(!vk_format_is_depth_or_stencil(format));
788 tu_cs_emit_array(cs, val->color.uint32, 4);
789 break;
790 }
791 }
792
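/* Emit a texture descriptor + sampler for the blit source. The descriptor is
 * copied into the command buffer's sub_cs so the base/UBWC addresses can be
 * patched per layer, then bound as FS texture state.
 */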
793 static void
794 r3d_src_common(struct tu_cmd_buffer *cmd,
795 struct tu_cs *cs,
796 const uint32_t *tex_const,
797 uint32_t offset_base,
798 uint32_t offset_ubwc,
799 VkFilter filter)
800 {
801 struct tu_cs_memory texture = { };
802 VkResult result = tu_cs_alloc(&cmd->sub_cs,
803 2, /* allocate space for a sampler too */
804 A6XX_TEX_CONST_DWORDS, &texture);
805 if (result != VK_SUCCESS) {
806 cmd->record_result = result;
807 return;
808 }
809
810 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
811
812 /* patch addresses for layer offset */
813 *(uint64_t*) (texture.map + 4) += offset_base;
814 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
815 texture.map[7] = ubwc_addr;
816 texture.map[8] = ubwc_addr >> 32;
817
818 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
819 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
820 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
821 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
822 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
823 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
824 0x60000; /* XXX used by blob, doesn't seem necessary */
825 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
826 0x1 | /* XXX used by blob, doesn't seem necessary */
827 A6XX_TEX_SAMP_1_UNNORM_COORDS |
828 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
829 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
830 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
831
832 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
833 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
834 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
835 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
836 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
837 CP_LOAD_STATE6_0_NUM_UNIT(1));
838 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
839
840 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));
841
842 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
843 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
844 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
845 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
846 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
847 CP_LOAD_STATE6_0_NUM_UNIT(1));
848 tu_cs_emit_qw(cs, texture.iova);
849
850 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
851 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
852 }
853
854 static void
855 r3d_src(struct tu_cmd_buffer *cmd,
856 struct tu_cs *cs,
857 const struct tu_image_view *iview,
858 uint32_t layer,
859 VkFilter filter)
860 {
861 r3d_src_common(cmd, cs, iview->descriptor,
862 iview->layer_size * layer,
863 iview->ubwc_layer_size * layer,
864 filter);
865 }
866
867 static void
868 r3d_src_buffer(struct tu_cmd_buffer *cmd,
869 struct tu_cs *cs,
870 VkFormat vk_format,
871 uint64_t va, uint32_t pitch,
872 uint32_t width, uint32_t height)
873 {
874 uint32_t desc[A6XX_TEX_CONST_DWORDS];
875
876 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
877
878 desc[0] =
879 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
880 A6XX_TEX_CONST_0_FMT(format.fmt) |
881 A6XX_TEX_CONST_0_SWAP(format.swap) |
882 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
883 // XXX to swizzle into .w for stencil buffer_to_image
884 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
885 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
886 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
887 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
888 desc[2] =
889 A6XX_TEX_CONST_2_PITCH(pitch) |
890 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
891 desc[3] = 0;
892 desc[4] = va;
893 desc[5] = va >> 32;
894 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
895 desc[i] = 0;
896
897 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
898 }
899
900 static void
901 r3d_src_gmem(struct tu_cmd_buffer *cmd,
902 struct tu_cs *cs,
903 const struct tu_image_view *iview,
904 VkFormat format,
905 uint32_t gmem_offset,
906 uint32_t cpp)
907 {
908 uint32_t desc[A6XX_TEX_CONST_DWORDS];
909 memcpy(desc, iview->descriptor, sizeof(desc));
910
911 /* patch the format so that depth/stencil get the right format */
912 desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
913 desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt);
914
915 /* patched for gmem */
916 desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
917 desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
918 desc[2] =
919 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
920 A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
921 desc[3] = 0;
922 desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
923 desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
924 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
925 desc[i] = 0;
926
927 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
928 }
929
930 static void
931 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
932 {
933 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
934 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
935 tu_cs_image_ref(cs, iview, layer);
936 tu_cs_emit(cs, 0);
937
938 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
939 tu_cs_image_flag_ref(cs, iview, layer);
940
941 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
942 }
943
944 static void
945 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
946 {
947 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
948 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
949 tu_cs_image_stencil_ref(cs, iview, layer);
950 tu_cs_emit(cs, 0);
951
952 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
953 }
954
955 static void
956 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
957 {
958 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
959
960 tu_cs_emit_regs(cs,
961 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
962 A6XX_RB_MRT_PITCH(0, pitch),
963 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
964 A6XX_RB_MRT_BASE(0, .qword = va),
965 A6XX_RB_MRT_BASE_GMEM(0, 0));
966
967 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
968 }
969
970 static uint8_t
971 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
972 {
973 uint8_t mask = 0xf;
974 assert(aspect_mask);
975 /* note: the only format with partial writing is D24S8,
976 * clear/blit uses the _AS_R8G8B8A8 format to access it
977 */
978 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
979 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
980 mask = 0x7;
981 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
982 mask = 0x8;
983 }
984 return mask;
985 }
986
987 static void
988 r3d_setup(struct tu_cmd_buffer *cmd,
989 struct tu_cs *cs,
990 VkFormat vk_format,
991 VkImageAspectFlags aspect_mask,
992 unsigned blit_param,
993 bool clear,
994 bool ubwc,
995 VkSampleCountFlagBits samples)
996 {
997 enum a6xx_format format = tu6_base_format(vk_format);
998
999 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
1000 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
1001 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
1002 }
1003
1004 if (!cmd->state.pass) {
1005 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1006 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1007 }
1008
1009 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
1010 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
1011
1012 r3d_common(cmd, cs, !clear, 1, blit_param, samples);
1013
1014 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1015 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1016 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1017 0xfc000000);
1018 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
1019
1020 tu_cs_emit_regs(cs,
1021 A6XX_RB_FS_OUTPUT_CNTL0(),
1022 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1023
1024 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1025 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1026
1027 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1028 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1029 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1030 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1031 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1032 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1033 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1034
1035 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
1036 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
1037
1038 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1039 .color_format = format,
1040 .color_sint = vk_format_is_sint(vk_format),
1041 .color_uint = vk_format_is_uint(vk_format)));
1042
1043 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1044 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
1045 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
1046 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
1047
1048 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1049 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1050
1051 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1052 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1053
1054 if (cmd->state.predication_active) {
1055 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1056 tu_cs_emit(cs, 0);
1057 }
1058 }
1059
1060 static void
1061 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1062 {
1063 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1064 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1065 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1066 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1067 tu_cs_emit(cs, 1); /* instance count */
1068 tu_cs_emit(cs, 2); /* vertex count */
1069 }
1070
1071 static void
1072 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1073 {
1074 if (cmd->state.predication_active) {
1075 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1076 tu_cs_emit(cs, 1);
1077 }
1078 }
1079
1080 /* blit ops - common interface for 2d/shader paths */
1081
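/* Typical usage: setup() once, then bind a source (src()/src_buffer() or
 * clear_value()) and a destination (dst()/dst_buffer()), program coords(),
 * and run() once per layer; finish with teardown().
 */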
1082 struct blit_ops {
1083 void (*coords)(struct tu_cs *cs,
1084 const VkOffset2D *dst,
1085 const VkOffset2D *src,
1086 const VkExtent2D *extent);
1087 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
1088 void (*src)(
1089 struct tu_cmd_buffer *cmd,
1090 struct tu_cs *cs,
1091 const struct tu_image_view *iview,
1092 uint32_t layer,
1093 VkFilter filter);
1094 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1095 VkFormat vk_format,
1096 uint64_t va, uint32_t pitch,
1097 uint32_t width, uint32_t height);
1098 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1099 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1100 void (*setup)(struct tu_cmd_buffer *cmd,
1101 struct tu_cs *cs,
1102 VkFormat vk_format,
1103 VkImageAspectFlags aspect_mask,
1104 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1105 bool clear,
1106 bool ubwc,
1107 VkSampleCountFlagBits samples);
1108 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1109 void (*teardown)(struct tu_cmd_buffer *cmd,
1110 struct tu_cs *cs);
1111 };
1112
1113 static const struct blit_ops r2d_ops = {
1114 .coords = r2d_coords,
1115 .clear_value = r2d_clear_value,
1116 .src = r2d_src,
1117 .src_buffer = r2d_src_buffer,
1118 .dst = r2d_dst,
1119 .dst_buffer = r2d_dst_buffer,
1120 .setup = r2d_setup,
1121 .run = r2d_run,
1122 .teardown = r2d_teardown,
1123 };
1124
1125 static const struct blit_ops r3d_ops = {
1126 .coords = r3d_coords,
1127 .clear_value = r3d_clear_value,
1128 .src = r3d_src,
1129 .src_buffer = r3d_src_buffer,
1130 .dst = r3d_dst,
1131 .dst_buffer = r3d_dst_buffer,
1132 .setup = r3d_setup,
1133 .run = r3d_run,
1134 .teardown = r3d_teardown,
1135 };
1136
1137 /* passthrough set coords from 3D extents */
1138 static void
1139 coords(const struct blit_ops *ops,
1140 struct tu_cs *cs,
1141 const VkOffset3D *dst,
1142 const VkOffset3D *src,
1143 const VkExtent3D *extent)
1144 {
1145 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1146 }
1147
1148 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1149 * to be a bit careful because we have to pick a format with matching UBWC
1150  * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1151 * everything.
1152 */
1153 static VkFormat
1154 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
1155 {
1156 if (vk_format_is_compressed(format)) {
1157 switch (vk_format_get_blocksize(format)) {
1158 case 1: return VK_FORMAT_R8_UINT;
1159 case 2: return VK_FORMAT_R16_UINT;
1160 case 4: return VK_FORMAT_R32_UINT;
1161 case 8: return VK_FORMAT_R32G32_UINT;
1162 case 16:return VK_FORMAT_R32G32B32A32_UINT;
1163 default:
1164 unreachable("unhandled format size");
1165 }
1166 }
1167
1168 switch (format) {
1169 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat
1170 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1171 * (also -1.0), when we're supposed to be memcpying the bits. See
1172 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1173 */
1174 case VK_FORMAT_R8_SNORM:
1175 return VK_FORMAT_R8_UNORM;
1176 case VK_FORMAT_R8G8_SNORM:
1177 return VK_FORMAT_R8G8_UNORM;
1178 case VK_FORMAT_R8G8B8_SNORM:
1179 return VK_FORMAT_R8G8B8_UNORM;
1180 case VK_FORMAT_B8G8R8_SNORM:
1181 return VK_FORMAT_B8G8R8_UNORM;
1182 case VK_FORMAT_R8G8B8A8_SNORM:
1183 return VK_FORMAT_R8G8B8A8_UNORM;
1184 case VK_FORMAT_B8G8R8A8_SNORM:
1185 return VK_FORMAT_B8G8R8A8_UNORM;
1186 case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
1187 return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
1188 case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
1189 return VK_FORMAT_A2R10G10B10_UNORM_PACK32;
1190 case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
1191 return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
1192 case VK_FORMAT_R16_SNORM:
1193 return VK_FORMAT_R16_UNORM;
1194 case VK_FORMAT_R16G16_SNORM:
1195 return VK_FORMAT_R16G16_UNORM;
1196 case VK_FORMAT_R16G16B16_SNORM:
1197 return VK_FORMAT_R16G16B16_UNORM;
1198 case VK_FORMAT_R16G16B16A16_SNORM:
1199 return VK_FORMAT_R16G16B16A16_UNORM;
1200
1201 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
1202 return VK_FORMAT_R32_UINT;
1203
1204 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
1205 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1206 return VK_FORMAT_R8G8_UNORM;
1207 else
1208 return VK_FORMAT_R8_UNORM;
1209 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
1210 return VK_FORMAT_R8_UNORM;
1211
1212 case VK_FORMAT_D24_UNORM_S8_UINT:
1213 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
1214 return VK_FORMAT_R8_UNORM;
1215 else
1216 return format;
1217
1218 case VK_FORMAT_D32_SFLOAT_S8_UINT:
1219 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1220 return VK_FORMAT_S8_UINT;
1221 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1222 return VK_FORMAT_D32_SFLOAT;
1223
1224 default:
1225 return format;
1226 }
1227 }
1228
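/* Clear an image's LRZ buffer by treating it as a linear D16 buffer of
 * lrz_pitch x lrz_height texels and doing a solid-color 2D blit to it.
 */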
1229 void
1230 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
1231 struct tu_cs *cs,
1232 struct tu_image *image,
1233 const VkClearValue *value)
1234 {
1235 const struct blit_ops *ops = &r2d_ops;
1236
1237 ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
1238 VK_SAMPLE_COUNT_1_BIT);
1239 ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
1240 ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
1241 image->bo->iova + image->bo_offset + image->lrz_offset,
1242 image->lrz_pitch * 2);
1243 ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
1244 ops->run(cmd, cs);
1245 ops->teardown(cmd, cs);
1246 }
1247
1248 static void
1249 tu_image_view_copy_blit(struct tu_image_view *iview,
1250 struct tu_image *image,
1251 VkFormat format,
1252 const VkImageSubresourceLayers *subres,
1253 uint32_t layer,
1254 bool stencil_read,
1255 bool z_scale)
1256 {
1257 VkImageAspectFlags aspect_mask = subres->aspectMask;
1258
1259 /* always use the AS_R8G8B8A8 format for these */
1260 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1261 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1262 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1263 }
1264
1265 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1266 .image = tu_image_to_handle(image),
1267 .viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D,
1268 .format = format,
1269 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1270 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1271 .subresourceRange = {
1272 .aspectMask = aspect_mask,
1273 .baseMipLevel = subres->mipLevel,
1274 .levelCount = 1,
1275 .baseArrayLayer = subres->baseArrayLayer + layer,
1276 .layerCount = 1,
1277 },
1278 }, false);
1279 }
1280
1281 static void
1282 tu_image_view_copy(struct tu_image_view *iview,
1283 struct tu_image *image,
1284 VkFormat format,
1285 const VkImageSubresourceLayers *subres,
1286 uint32_t layer,
1287 bool stencil_read)
1288 {
1289 format = copy_format(format, subres->aspectMask, false);
1290 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false);
1291 }
1292
1293 static void
1294 tu_image_view_blit(struct tu_image_view *iview,
1295 struct tu_image *image,
1296 const VkImageSubresourceLayers *subres,
1297 uint32_t layer)
1298 {
1299 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false);
1300 }
1301
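/* Blit a single VkImageBlit region. Uses the 2D engine when possible and
 * falls back to the 3D path for multisampled destinations, BC1_RGB sources,
 * cubic filtering, and z-scaled 3D blits (see below).
 */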
1302 static void
1303 tu6_blit_image(struct tu_cmd_buffer *cmd,
1304 struct tu_image *src_image,
1305 struct tu_image *dst_image,
1306 const VkImageBlit *info,
1307 VkFilter filter)
1308 {
1309 const struct blit_ops *ops = &r2d_ops;
1310 struct tu_cs *cs = &cmd->cs;
1311 bool z_scale = false;
1312 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
1313
1314 /* 2D blit can't do rotation mirroring from just coordinates */
1315 static const enum a6xx_rotation rotate[2][2] = {
1316 {ROTATE_0, ROTATE_HFLIP},
1317 {ROTATE_VFLIP, ROTATE_180},
1318 };
1319
1320 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1321 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1322 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1323 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1324
1325 int32_t src0_z = info->srcOffsets[0].z;
1326 int32_t src1_z = info->srcOffsets[1].z;
1327
1328 if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
1329 info->dstOffsets[1].z - info->dstOffsets[0].z) ||
1330 info->srcOffsets[1].z < info->srcOffsets[0].z) {
1331 z_scale = true;
1332 }
1333
1334 if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
1335 layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
1336 src0_z = info->srcOffsets[1].z;
1337 src1_z = info->srcOffsets[0].z;
1338 }
1339
1340 if (info->dstSubresource.layerCount > 1) {
1341 assert(layers <= 1);
1342 layers = info->dstSubresource.layerCount;
1343 }
1344
1345    /* BC1_RGB_* formats need to have their last components overridden with 1
1346 * when sampling, which is normally handled with the texture descriptor
1347 * swizzle. The 2d path can't handle that, so use the 3d path.
1348 *
1349 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1350 * the 2d path.
1351 */
1352
1353 unsigned blit_param = rotate[mirror_y][mirror_x];
1354 if (dst_image->layout[0].nr_samples > 1 ||
1355 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1356 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1357 filter == VK_FILTER_CUBIC_EXT ||
1358 z_scale) {
1359 ops = &r3d_ops;
1360 blit_param = z_scale;
1361 }
1362
1363 /* use the right format in setup() for D32_S8
1364 * TODO: this probably should use a helper
1365 */
1366 VkFormat format = dst_image->vk_format;
1367 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1368 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1369 format = VK_FORMAT_D32_SFLOAT;
1370 else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1371 format = VK_FORMAT_S8_UINT;
1372 else
1373 unreachable("unexpected D32_S8 aspect mask in blit_image");
1374 }
1375
1376 trace_start_blit(&cmd->trace, cs);
1377
1378 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1379 blit_param, false, dst_image->layout[0].ubwc,
1380 dst_image->layout[0].nr_samples);
1381
1382 if (ops == &r3d_ops) {
1383 r3d_coords_raw(cs, (float[]) {
1384 info->dstOffsets[0].x, info->dstOffsets[0].y,
1385 info->srcOffsets[0].x, info->srcOffsets[0].y,
1386 info->dstOffsets[1].x, info->dstOffsets[1].y,
1387 info->srcOffsets[1].x, info->srcOffsets[1].y
1388 });
1389 } else {
1390 tu_cs_emit_regs(cs,
1391 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1392 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1393 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1394 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1395 tu_cs_emit_regs(cs,
1396 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1397 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1398 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1399 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1400 }
1401
1402 struct tu_image_view dst, src;
1403 tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
1404 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
1405
1406 if (z_scale) {
1407 tu_image_view_copy_blit(&src, src_image, src_image->vk_format,
1408 &info->srcSubresource, 0, false, true);
1409 ops->src(cmd, cs, &src, 0, filter);
1410 } else {
1411 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1412 }
1413
1414 for (uint32_t i = 0; i < layers; i++) {
1415 if (z_scale) {
1416 float t = ((float) i + 0.5f) / (float) layers;
1417 r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);
1418 } else {
1419 ops->src(cmd, cs, &src, i, filter);
1420 }
1421 ops->dst(cs, &dst, i);
1422 ops->run(cmd, cs);
1423 }
1424
1425 ops->teardown(cmd, cs);
1426
1427 trace_end_blit(&cmd->trace, cs,
1428 ops == &r3d_ops,
1429 src_image->vk_format,
1430 dst_image->vk_format,
1431 layers);
1432 }
1433
1434 VKAPI_ATTR void VKAPI_CALL
1435 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1436 VkImage srcImage,
1437 VkImageLayout srcImageLayout,
1438 VkImage dstImage,
1439 VkImageLayout dstImageLayout,
1440 uint32_t regionCount,
1441 const VkImageBlit *pRegions,
1442 VkFilter filter)
1443
1444 {
1445 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1446 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1447 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1448
1449 for (uint32_t i = 0; i < regionCount; ++i) {
1450 /* can't blit both depth and stencil at once with D32_S8
1451 * TODO: more advanced 3D blit path to support it instead?
1452 */
1453 if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1454 dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1455 VkImageBlit region = pRegions[i];
1456 u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
1457 region.srcSubresource.aspectMask = BIT(b);
1458 region.dstSubresource.aspectMask = BIT(b);
1459          tu6_blit_image(cmd, src_image, dst_image, &region, filter);
1460 }
1461 continue;
1462 }
1463 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1464 }
1465 }
1466
1467 static void
1468 copy_compressed(VkFormat format,
1469 VkOffset3D *offset,
1470 VkExtent3D *extent,
1471 uint32_t *width,
1472 uint32_t *height)
1473 {
1474 if (!vk_format_is_compressed(format))
1475 return;
1476
1477 uint32_t block_width = vk_format_get_blockwidth(format);
1478 uint32_t block_height = vk_format_get_blockheight(format);
1479
1480 offset->x /= block_width;
1481 offset->y /= block_height;
1482
1483 if (extent) {
1484 extent->width = DIV_ROUND_UP(extent->width, block_width);
1485 extent->height = DIV_ROUND_UP(extent->height, block_height);
1486 }
1487 if (width)
1488 *width = DIV_ROUND_UP(*width, block_width);
1489 if (height)
1490 *height = DIV_ROUND_UP(*height, block_height);
1491 }
1492
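/* Copy a buffer to an image. Buffer addresses and pitches that are not
 * 64-byte aligned take a slower path that blits one row at a time, rounding
 * the base address down and folding the misalignment into the x offset.
 */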
1493 static void
1494 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1495 struct tu_buffer *src_buffer,
1496 struct tu_image *dst_image,
1497 const VkBufferImageCopy *info)
1498 {
1499 struct tu_cs *cs = &cmd->cs;
1500 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1501 VkFormat src_format =
1502 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1503 const struct blit_ops *ops = &r2d_ops;
1504
1505 /* special case for buffer to stencil */
1506 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1507 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1508 ops = &r3d_ops;
1509 }
1510
1511 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1512 * which matters for UBWC. buffer_to_image/etc can fail because of this
1513 */
1514
1515 VkOffset3D offset = info->imageOffset;
1516 VkExtent3D extent = info->imageExtent;
1517 uint32_t src_width = info->bufferRowLength ?: extent.width;
1518 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1519
1520 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1521
1522 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1523 uint32_t layer_size = src_height * pitch;
1524
1525 ops->setup(cmd, cs,
1526 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1527 info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
1528 dst_image->layout[0].nr_samples);
1529
1530 struct tu_image_view dst;
1531 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1532
1533 for (uint32_t i = 0; i < layers; i++) {
1534 ops->dst(cs, &dst, i);
1535
1536 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1537 if ((src_va & 63) || (pitch & 63)) {
1538 for (uint32_t y = 0; y < extent.height; y++) {
1539 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1540 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1541 x + extent.width, 1);
1542 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1543 &(VkExtent2D) {extent.width, 1});
1544 ops->run(cmd, cs);
1545 src_va += pitch;
1546 }
1547 } else {
1548 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1549 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1550 ops->run(cmd, cs);
1551 }
1552 }
1553
1554 ops->teardown(cmd, cs);
1555 }
1556
1557 VKAPI_ATTR void VKAPI_CALL
1558 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1559 VkBuffer srcBuffer,
1560 VkImage dstImage,
1561 VkImageLayout dstImageLayout,
1562 uint32_t regionCount,
1563 const VkBufferImageCopy *pRegions)
1564 {
1565 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1566 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1567 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1568
1569 for (unsigned i = 0; i < regionCount; ++i)
1570 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1571 }
1572
1573 static void
1574 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1575 struct tu_image *src_image,
1576 struct tu_buffer *dst_buffer,
1577 const VkBufferImageCopy *info)
1578 {
1579 struct tu_cs *cs = &cmd->cs;
1580 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1581 VkFormat dst_format =
1582 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1583 bool stencil_read = false;
1584
1585 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1586 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1587 stencil_read = true;
1588 }
1589
1590 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1591 VkOffset3D offset = info->imageOffset;
1592 VkExtent3D extent = info->imageExtent;
1593 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1594 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1595
1596 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1597
1598 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1599 uint32_t layer_size = pitch * dst_height;
1600
1601 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1602 VK_SAMPLE_COUNT_1_BIT);
1603
1604 struct tu_image_view src;
1605 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1606
1607 for (uint32_t i = 0; i < layers; i++) {
1608 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1609
1610 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
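      /* Same trick as the buffer-to-image path: if the destination address or
       * the pitch is not 64-byte aligned, write one row at a time, aligning
       * the address down and folding the remainder into the destination x
       * coordinate.
       */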
1611 if ((dst_va & 63) || (pitch & 63)) {
1612 for (uint32_t y = 0; y < extent.height; y++) {
1613 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1614 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1615 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1616 &(VkExtent2D) {extent.width, 1});
1617 ops->run(cmd, cs);
1618 dst_va += pitch;
1619 }
1620 } else {
1621 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1622 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1623 ops->run(cmd, cs);
1624 }
1625 }
1626
1627 ops->teardown(cmd, cs);
1628 }
1629
1630 VKAPI_ATTR void VKAPI_CALL
1631 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1632 VkImage srcImage,
1633 VkImageLayout srcImageLayout,
1634 VkBuffer dstBuffer,
1635 uint32_t regionCount,
1636 const VkBufferImageCopy *pRegions)
1637 {
1638 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1639 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1640 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1641
1642 for (unsigned i = 0; i < regionCount; ++i)
1643 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1644 }
1645
1646 /* Tiled formats don't support swapping, which means that we can't support
1647 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1648 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1649 * Currently we fake support for tiled swapped formats and use the unswapped
1650 * format instead, but this means that reinterpreting copies to and from
1651 * swapped formats can't be performed correctly unless we can swizzle the
1652 * components by reinterpreting the other image as the "correct" swapped
1653 * format, i.e. only when the other image is linear.
1654 */
1655
1656 static bool
1657 is_swapped_format(VkFormat format)
1658 {
1659 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1660 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1661 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1662 }
1663
1664 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1665 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1666 * versa). This should mirror the logic in fdl6_layout.
1667 */
1668 static bool
1669 image_is_r8g8(struct tu_image *image)
1670 {
1671 return image->layout[0].cpp == 2 &&
1672 vk_format_get_nr_components(image->vk_format) == 2;
1673 }
1674
1675 static void
1676 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1677 struct tu_image *src_image,
1678 struct tu_image *dst_image,
1679 const VkImageCopy *info)
1680 {
1681 const struct blit_ops *ops = &r2d_ops;
1682 struct tu_cs *cs = &cmd->cs;
1683
1684 if (dst_image->layout[0].nr_samples > 1)
1685 ops = &r3d_ops;
1686
1687 VkFormat format = VK_FORMAT_UNDEFINED;
1688 VkOffset3D src_offset = info->srcOffset;
1689 VkOffset3D dst_offset = info->dstOffset;
1690 VkExtent3D extent = info->extent;
1691 uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);
1692
1693 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1694 * Images":
1695 *
1696 * When copying between compressed and uncompressed formats the extent
1697 * members represent the texel dimensions of the source image and not
1698 * the destination. When copying from a compressed image to an
1699 * uncompressed image the image texel dimensions written to the
1700 * uncompressed image will be source extent divided by the compressed
1701 * texel block dimensions. When copying from an uncompressed image to a
1702 * compressed image the image texel dimensions written to the compressed
1703 * image will be the source extent multiplied by the compressed texel
1704 * block dimensions.
1705 *
1706 * This means we only have to adjust the extent if the source image is
1707 * compressed.
1708 */
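   /* E.g. copying a 64x64 texel region from a BC1 (4x4 block) source to an
    * uncompressed destination shrinks the extent to 16x16 here, which is the
    * region actually written to the destination.
    */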
1709 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1710 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1711
1712 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1713 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1714
1715 bool use_staging_blit = false;
1716
1717 if (src_format == dst_format) {
1718 /* Images that share a format can always be copied directly because it's
1719 * the same as a blit.
1720 */
1721 format = src_format;
1722 } else if (!src_image->layout[0].tile_mode) {
1723 /* If an image is linear, we can always safely reinterpret it with the
1724 * other image's format and then do a regular blit.
1725 */
1726 format = dst_format;
1727 } else if (!dst_image->layout[0].tile_mode) {
1728 format = src_format;
1729 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1730 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1731 * due to the different tile layout.
1732 */
1733 use_staging_blit = true;
1734 } else if (is_swapped_format(src_format) ||
1735 is_swapped_format(dst_format)) {
1736 /* If either format has a non-identity swap, then we can't copy
1737 * to/from it.
1738 */
1739 use_staging_blit = true;
1740 } else if (!src_image->layout[0].ubwc) {
1741 format = dst_format;
1742 } else if (!dst_image->layout[0].ubwc) {
1743 format = src_format;
1744 } else {
1745 /* Both formats use UBWC and so neither can be reinterpreted.
1746 * TODO: We could do an in-place decompression of the dst instead.
1747 */
1748 use_staging_blit = true;
1749 }
1750
1751 struct tu_image_view dst, src;
1752
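   /* The staging path below copies src into a linear, non-UBWC staging image
    * using src_format, flushes, and then reinterprets the staging image as
    * dst_format for the copy into dst.
    */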
1753 if (use_staging_blit) {
1754 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1755 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1756
1757 struct tu_image staging_image = {
1758 .base.type = VK_OBJECT_TYPE_IMAGE,
1759 .vk_format = src_format,
1760 .level_count = 1,
1761 .layer_count = info->srcSubresource.layerCount,
1762 .bo_offset = 0,
1763 };
1764
1765 VkImageSubresourceLayers staging_subresource = {
1766 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1767 .mipLevel = 0,
1768 .baseArrayLayer = 0,
1769 .layerCount = info->srcSubresource.layerCount,
1770 };
1771
1772 VkOffset3D staging_offset = { 0 };
1773
1774 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1775 staging_image.layout[0].ubwc = false;
1776
1777 fdl6_layout(&staging_image.layout[0],
1778 vk_format_to_pipe_format(staging_image.vk_format),
1779 src_image->layout[0].nr_samples,
1780 extent.width,
1781 extent.height,
1782 extent.depth,
1783 staging_image.level_count,
1784 staging_image.layer_count,
1785 extent.depth > 1,
1786 NULL);
1787
1788 VkResult result = tu_get_scratch_bo(cmd->device,
1789 staging_image.layout[0].size,
1790 &staging_image.bo);
1791 if (result != VK_SUCCESS) {
1792 cmd->record_result = result;
1793 return;
1794 }
1795
1796 struct tu_image_view staging;
1797 tu_image_view_copy(&staging, &staging_image, src_format,
1798 &staging_subresource, 0, false);
1799
1800 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1801 dst_image->layout[0].nr_samples);
1802 coords(ops, cs, &staging_offset, &src_offset, &extent);
1803
1804 for (uint32_t i = 0; i < layers_to_copy; i++) {
1805 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1806 ops->dst(cs, &staging, i);
1807 ops->run(cmd, cs);
1808 }
1809
1810 /* If the application did this copy itself there would have to be a
1811  * pipeline barrier here; since we're doing it internally, we have to flush ourselves.
1812  */
1813 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1814 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1815 tu_cs_emit_wfi(cs);
1816
1817 tu_image_view_copy(&staging, &staging_image, dst_format,
1818 &staging_subresource, 0, false);
1819
1820 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1821 0, false, dst_image->layout[0].ubwc,
1822 dst_image->layout[0].nr_samples);
1823 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1824
1825 for (uint32_t i = 0; i < layers_to_copy; i++) {
1826 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1827 ops->dst(cs, &dst, i);
1828 ops->run(cmd, cs);
1829 }
1830 } else {
1831 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1832 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1833
1834 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1835 0, false, dst_image->layout[0].ubwc,
1836 dst_image->layout[0].nr_samples);
1837 coords(ops, cs, &dst_offset, &src_offset, &extent);
1838
1839 for (uint32_t i = 0; i < layers_to_copy; i++) {
1840 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1841 ops->dst(cs, &dst, i);
1842 ops->run(cmd, cs);
1843 }
1844 }
1845
1846 ops->teardown(cmd, cs);
1847 }
1848
1849 VKAPI_ATTR void VKAPI_CALL
1850 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1851 VkImage srcImage,
1852 VkImageLayout srcImageLayout,
1853 VkImage destImage,
1854 VkImageLayout destImageLayout,
1855 uint32_t regionCount,
1856 const VkImageCopy *pRegions)
1857 {
1858 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1859 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1860 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1861
1862 for (uint32_t i = 0; i < regionCount; ++i) {
1863 if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1864 VkImageCopy info = pRegions[i];
1865 u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
1866 info.srcSubresource.aspectMask = BIT(b);
1867 info.dstSubresource.aspectMask = BIT(b);
1868 tu_copy_image_to_image(cmd, src_image, dst_image, &info);
1869 }
1870 continue;
1871 }
1872
1873 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1874 }
1875 }
1876
1877 static void
1878 copy_buffer(struct tu_cmd_buffer *cmd,
1879 uint64_t dst_va,
1880 uint64_t src_va,
1881 uint64_t size,
1882 uint32_t block_size)
1883 {
1884 const struct blit_ops *ops = &r2d_ops;
1885 struct tu_cs *cs = &cmd->cs;
1886 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1887 uint64_t blocks = size / block_size;
1888
1889 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1890 VK_SAMPLE_COUNT_1_BIT);
1891
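   /* Copy in chunks, treating the buffers as 1D images one texel high. Each
    * chunk aligns the addresses down to 64 bytes, exposes the remainder as an
    * x offset, and is sized so that x + width stays within the 0x4000
    * coordinate limit on both the source and the destination.
    */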
1892 while (blocks) {
1893 uint32_t src_x = (src_va & 63) / block_size;
1894 uint32_t dst_x = (dst_va & 63) / block_size;
1895 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1896
1897 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1898 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1899 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1900 ops->run(cmd, cs);
1901
1902 src_va += width * block_size;
1903 dst_va += width * block_size;
1904 blocks -= width;
1905 }
1906
1907 ops->teardown(cmd, cs);
1908 }
1909
1910 VKAPI_ATTR void VKAPI_CALL
1911 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1912 VkBuffer srcBuffer,
1913 VkBuffer dstBuffer,
1914 uint32_t regionCount,
1915 const VkBufferCopy *pRegions)
1916 {
1917 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1918 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1919 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1920
1921 for (unsigned i = 0; i < regionCount; ++i) {
1922 copy_buffer(cmd,
1923 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1924 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1925 pRegions[i].size, 1);
1926 }
1927 }
1928
1929 VKAPI_ATTR void VKAPI_CALL
1930 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1931 VkBuffer dstBuffer,
1932 VkDeviceSize dstOffset,
1933 VkDeviceSize dataSize,
1934 const void *pData)
1935 {
1936 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1937 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1938
1939 struct tu_cs_memory tmp;
1940 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
1941 if (result != VK_SUCCESS) {
1942 cmd->record_result = result;
1943 return;
1944 }
1945
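   /* Stage the data in scratch command-stream memory, then reuse copy_buffer()
    * with a 4-byte block size (dataSize must be a multiple of 4).
    */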
1946 memcpy(tmp.map, pData, dataSize);
1947 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1948 }
1949
1950 VKAPI_ATTR void VKAPI_CALL
1951 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1952 VkBuffer dstBuffer,
1953 VkDeviceSize dstOffset,
1954 VkDeviceSize fillSize,
1955 uint32_t data)
1956 {
1957 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1958 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1959 const struct blit_ops *ops = &r2d_ops;
1960 struct tu_cs *cs = &cmd->cs;
1961
1962 if (fillSize == VK_WHOLE_SIZE)
1963 fillSize = buffer->size - dstOffset;
1964
1965 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1966 uint32_t blocks = fillSize / 4;
1967
1968 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
1969 VK_SAMPLE_COUNT_1_BIT);
1970 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1971
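   /* Emit the fill as a series of solid-color R32_UINT blits, each with the
    * address aligned down to 64 bytes and the remainder expressed as dst_x,
    * clamped so that dst_x + width stays within the 0x4000 coordinate limit.
    */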
1972 while (blocks) {
1973 uint32_t dst_x = (dst_va & 63) / 4;
1974 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1975
1976 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1977 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1978 ops->run(cmd, cs);
1979
1980 dst_va += width * 4;
1981 blocks -= width;
1982 }
1983
1984 ops->teardown(cmd, cs);
1985 }
1986
1987 VKAPI_ATTR void VKAPI_CALL
1988 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1989 VkImage srcImage,
1990 VkImageLayout srcImageLayout,
1991 VkImage dstImage,
1992 VkImageLayout dstImageLayout,
1993 uint32_t regionCount,
1994 const VkImageResolve *pRegions)
1995 {
1996 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1997 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1998 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1999 const struct blit_ops *ops = &r2d_ops;
2000 struct tu_cs *cs = &cmd->cs;
2001
2002 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
2003 0, false, dst_image->layout[0].ubwc, VK_SAMPLE_COUNT_1_BIT);
2004
2005 for (uint32_t i = 0; i < regionCount; ++i) {
2006 const VkImageResolve *info = &pRegions[i];
2007 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
2008
2009 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
2010 /* TODO: aspect masks possible? */
2011
2012 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
2013
2014 struct tu_image_view dst, src;
2015 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2016 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2017
2018 for (uint32_t i = 0; i < layers; i++) {
2019 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
2020 ops->dst(cs, &dst, i);
2021 ops->run(cmd, cs);
2022 }
2023 }
2024
2025 ops->teardown(cmd, cs);
2026 }
2027
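/* Iterate over layers: with a non-zero multiview layer_mask, loop up to the
 * highest set bit and skip layers that are not in the mask; otherwise loop
 * over all `layers` layers.
 */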
2028 #define for_each_layer(layer, layer_mask, layers) \
2029 for (uint32_t layer = 0; \
2030 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2031 layer++) \
2032 if (!layer_mask || (layer_mask & BIT(layer)))
2033
2034 static void
2035 resolve_sysmem(struct tu_cmd_buffer *cmd,
2036 struct tu_cs *cs,
2037 VkFormat format,
2038 const struct tu_image_view *src,
2039 const struct tu_image_view *dst,
2040 uint32_t layer_mask,
2041 uint32_t layers,
2042 const VkRect2D *rect,
2043 bool separate_stencil)
2044 {
2045 const struct blit_ops *ops = &r2d_ops;
2046
2047 trace_start_sysmem_resolve(&cmd->trace, cs);
2048
2049 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,
2050 0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT);
2051 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
2052
2053 for_each_layer(i, layer_mask, layers) {
2054 if (separate_stencil) {
2055 r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
2056 r2d_dst_stencil(cs, dst, i);
2057 } else {
2058 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
2059 ops->dst(cs, dst, i);
2060 }
2061 ops->run(cmd, cs);
2062 }
2063
2064 ops->teardown(cmd, cs);
2065
2066 trace_end_sysmem_resolve(&cmd->trace, cs, format);
2067 }
2068
2069 void
2070 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
2071 struct tu_cs *cs,
2072 const struct tu_image_view *src,
2073 const struct tu_image_view *dst,
2074 uint32_t layer_mask,
2075 uint32_t layers,
2076 const VkRect2D *rect)
2077 {
2078 assert(src->image->vk_format == dst->image->vk_format);
2079
2080 if (dst->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2081 resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT,
2082 src, dst, layer_mask, layers, rect, false);
2083 resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT,
2084 src, dst, layer_mask, layers, rect, true);
2085 } else {
2086 resolve_sysmem(cmd, cs, dst->image->vk_format,
2087 src, dst, layer_mask, layers, rect, false);
2088 }
2089 }
2090
2091 static void
2092 clear_image(struct tu_cmd_buffer *cmd,
2093 struct tu_image *image,
2094 const VkClearValue *clear_value,
2095 const VkImageSubresourceRange *range,
2096 VkImageAspectFlags aspect_mask)
2097 {
2098 uint32_t level_count = tu_get_levelCount(image, range);
2099 uint32_t layer_count = tu_get_layerCount(image, range);
2100 struct tu_cs *cs = &cmd->cs;
2101 VkFormat format = image->vk_format;
2102 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2103 format = copy_format(format, aspect_mask, false);
2104
2105 if (image->layout[0].depth0 > 1) {
2106 assert(layer_count == 1);
2107 assert(range->baseArrayLayer == 0);
2108 }
2109
2110 const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;
2111
2112 ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc,
2113 image->layout[0].nr_samples);
2114 if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2115 ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
2116 else
2117 ops->clear_value(cs, format, clear_value);
2118
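   /* For 3D images the "layers" of each level are its depth slices, so
    * layer_count is recomputed per level from the minified depth below.
    */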
2119 for (unsigned j = 0; j < level_count; j++) {
2120 if (image->layout[0].depth0 > 1)
2121 layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
2122
2123 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
2124 u_minify(image->layout[0].width0, range->baseMipLevel + j),
2125 u_minify(image->layout[0].height0, range->baseMipLevel + j)
2126 });
2127
2128 struct tu_image_view dst;
2129 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
2130 .aspectMask = aspect_mask,
2131 .mipLevel = range->baseMipLevel + j,
2132 .baseArrayLayer = range->baseArrayLayer,
2133 .layerCount = 1,
2134 }, 0, false, false);
2135
2136 for (uint32_t i = 0; i < layer_count; i++) {
2137 ops->dst(cs, &dst, i);
2138 ops->run(cmd, cs);
2139 }
2140 }
2141
2142 ops->teardown(cmd, cs);
2143 }
2144
2145 VKAPI_ATTR void VKAPI_CALL
2146 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
2147 VkImage image_h,
2148 VkImageLayout imageLayout,
2149 const VkClearColorValue *pColor,
2150 uint32_t rangeCount,
2151 const VkImageSubresourceRange *pRanges)
2152 {
2153 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2154 TU_FROM_HANDLE(tu_image, image, image_h);
2155
2156 for (unsigned i = 0; i < rangeCount; i++)
2157 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
2158 }
2159
2160 VKAPI_ATTR void VKAPI_CALL
2161 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
2162 VkImage image_h,
2163 VkImageLayout imageLayout,
2164 const VkClearDepthStencilValue *pDepthStencil,
2165 uint32_t rangeCount,
2166 const VkImageSubresourceRange *pRanges)
2167 {
2168 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2169 TU_FROM_HANDLE(tu_image, image, image_h);
2170
2171 for (unsigned i = 0; i < rangeCount; i++) {
2172 const VkImageSubresourceRange *range = &pRanges[i];
2173
2174 if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2175 /* can't clear both depth and stencil at once, split up the aspect mask */
2176 u_foreach_bit(b, range->aspectMask)
2177 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
2178 continue;
2179 }
2180
2181 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
2182 }
2183 }
2184
2185 static void
2186 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2187 uint32_t attachment_count,
2188 const VkClearAttachment *attachments,
2189 uint32_t rect_count,
2190 const VkClearRect *rects)
2191 {
2192 /* The shader path here is special: it avoids changing MRT/etc. state */
2193 const struct tu_subpass *subpass = cmd->state.subpass;
2194 const uint32_t mrt_count = subpass->color_count;
2195 struct tu_cs *cs = &cmd->draw_cs;
2196 uint32_t clear_value[MAX_RTS][4];
2197 float z_clear_val = 0.0f;
2198 uint8_t s_clear_val = 0;
2199 uint32_t clear_rts = 0, clear_components = 0;
2200 bool z_clear = false;
2201 bool s_clear = false;
2202
2203 trace_start_sysmem_clear_all(&cmd->trace, cs);
2204
2205 for (uint32_t i = 0; i < attachment_count; i++) {
2206 uint32_t a;
2207 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2208 uint32_t c = attachments[i].colorAttachment;
2209 a = subpass->color_attachments[c].attachment;
2210 if (a == VK_ATTACHMENT_UNUSED)
2211 continue;
2212
2213 clear_rts |= 1 << c;
2214 clear_components |= 0xf << (c * 4);
2215 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2216 } else {
2217 a = subpass->depth_stencil_attachment.attachment;
2218 if (a == VK_ATTACHMENT_UNUSED)
2219 continue;
2220
2221 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2222 z_clear = true;
2223 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2224 }
2225
2226 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2227 s_clear = true;
2228 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2229 }
2230 }
2231 }
2232
2233 /* We may not know the multisample count if there are no attachments, so
2234 * just bail early to avoid corner cases later.
2235 */
2236 if (clear_rts == 0 && !z_clear && !s_clear)
2237 return;
2238
2239 /* Disable all draw states so they don't interfere.
2240  * TODO: use and re-use draw states.
2241  * We have to disable draw states individually to preserve
2242  * input attachment states, because a secondary command buffer
2243  * won't be able to restore them.
2244  */
2245 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
2246 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
2247 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
2248 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
2249 continue;
2250 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
2251 CP_SET_DRAW_STATE__0_DISABLE);
2252 tu_cs_emit_qw(cs, 0);
2253 }
2254 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2255
2256 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2257 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2258 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2259 0xfc000000);
2260 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2261
2262 r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);
2263
2264 tu_cs_emit_regs(cs,
2265 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2266 tu_cs_emit_regs(cs,
2267 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2268
2269 tu_cs_emit_regs(cs,
2270 A6XX_RB_FS_OUTPUT_CNTL0(),
2271 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2272
2273 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2274 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2275 for (uint32_t i = 0; i < mrt_count; i++) {
2276 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2277 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2278 }
2279
2280 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
2281 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
2282
2283 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2284 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2285 .z_test_enable = z_clear,
2286 .z_write_enable = z_clear,
2287 .zfunc = FUNC_ALWAYS));
2288 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2289 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2290 .stencil_enable = s_clear,
2291 .func = FUNC_ALWAYS,
2292 .zpass = STENCIL_REPLACE));
2293 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2294 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2295 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2296
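   /* Upload one vec4 of clear color per cleared RT as fragment shader
    * constants; the clear shader set up by r3d_common() sources its output
    * colors from these.
    */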
2297 unsigned num_rts = util_bitcount(clear_rts);
2298 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2299 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2300 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2301 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2302 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2303 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2304 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2305 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2306 u_foreach_bit(b, clear_rts)
2307 tu_cs_emit_array(cs, clear_value[b], 4);
2308
2309 for (uint32_t i = 0; i < rect_count; i++) {
2310 /* This should be true because of this valid usage for
2311 * vkCmdClearAttachments:
2312 *
2313 * "If the render pass instance this is recorded in uses multiview,
2314 * then baseArrayLayer must be zero and layerCount must be one"
2315 */
2316 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
2317
2318 /* a630 doesn't support multiview masks, which means that we can't use
2319 * the normal multiview path without potentially recompiling a shader
2320 * on-demand or using a more complicated variant that takes the mask as
2321 * a const. Just use the layered path instead, since it shouldn't be
2322 * much worse.
2323 */
2324 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
2325 r3d_coords_raw(cs, (float[]) {
2326 rects[i].rect.offset.x, rects[i].rect.offset.y,
2327 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2328 rects[i].rect.offset.x + rects[i].rect.extent.width,
2329 rects[i].rect.offset.y + rects[i].rect.extent.height,
2330 z_clear_val, 1.0f,
2331 });
2332 r3d_run(cmd, cs);
2333 }
2334 }
2335
2336 trace_end_sysmem_clear_all(&cmd->trace,
2337 cs, mrt_count, rect_count);
2338 }
2339
2340 static void
2341 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2342 {
2343 switch (format) {
2344 case VK_FORMAT_X8_D24_UNORM_PACK32:
2345 case VK_FORMAT_D24_UNORM_S8_UINT:
2346 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2347 val->depthStencil.stencil << 24;
2348 return;
2349 case VK_FORMAT_D16_UNORM:
2350 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2351 return;
2352 case VK_FORMAT_D32_SFLOAT:
2353 clear_value[0] = fui(val->depthStencil.depth);
2354 return;
2355 case VK_FORMAT_S8_UINT:
2356 clear_value[0] = val->depthStencil.stencil;
2357 return;
2358 default:
2359 break;
2360 }
2361
2362 float tmp[4];
2363 memcpy(tmp, val->color.float32, 4 * sizeof(float));
2364 if (vk_format_is_srgb(format)) {
2365 for (int i = 0; i < 3; i++)
2366 tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
2367 }
2368
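   /* PACK_F(fmt) packs a single texel of the (possibly sRGB-converted) clear
    * color into clear_value using the corresponding u_format pack helper.
    */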
2369 #define PACK_F(type) util_format_##type##_pack_rgba_float \
2370 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
2371 switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
2372 case 4:
2373 PACK_F(r4g4b4a4_unorm);
2374 break;
2375 case 5:
2376 if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
2377 PACK_F(r5g6b5_unorm);
2378 else
2379 PACK_F(r5g5b5a1_unorm);
2380 break;
2381 case 8:
2382 if (vk_format_is_snorm(format))
2383 PACK_F(r8g8b8a8_snorm);
2384 else if (vk_format_is_unorm(format))
2385 PACK_F(r8g8b8a8_unorm);
2386 else
2387 pack_int8(clear_value, val->color.uint32);
2388 break;
2389 case 10:
2390 if (vk_format_is_int(format))
2391 pack_int10_2(clear_value, val->color.uint32);
2392 else
2393 PACK_F(r10g10b10a2_unorm);
2394 break;
2395 case 11:
2396 clear_value[0] = float3_to_r11g11b10f(val->color.float32);
2397 break;
2398 case 16:
2399 if (vk_format_is_snorm(format))
2400 PACK_F(r16g16b16a16_snorm);
2401 else if (vk_format_is_unorm(format))
2402 PACK_F(r16g16b16a16_unorm);
2403 else if (vk_format_is_float(format))
2404 PACK_F(r16g16b16a16_float);
2405 else
2406 pack_int16(clear_value, val->color.uint32);
2407 break;
2408 case 32:
2409 memcpy(clear_value, val->color.float32, 4 * sizeof(float));
2410 break;
2411 default:
2412 unreachable("unexpected channel size");
2413 }
2414 #undef PACK_F
2415 }
2416
2417 static void
2418 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2419 struct tu_cs *cs,
2420 VkFormat format,
2421 uint8_t clear_mask,
2422 uint32_t gmem_offset,
2423 const VkClearValue *value)
2424 {
2425 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2426 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2427
2428 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2429
2430 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2431 tu_cs_emit(cs, gmem_offset);
2432
2433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2434 tu_cs_emit(cs, 0);
2435
2436 uint32_t clear_vals[4] = {};
2437 pack_gmem_clear_value(value, format, clear_vals);
2438
2439 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2440 tu_cs_emit_array(cs, clear_vals, 4);
2441
2442 tu6_emit_event_write(cmd, cs, BLIT);
2443 }
2444
2445 static void
2446 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2447 struct tu_cs *cs,
2448 uint32_t attachment,
2449 VkImageAspectFlags mask,
2450 const VkClearValue *value)
2451 {
2452 const struct tu_render_pass_attachment *att =
2453 &cmd->state.pass->attachments[attachment];
2454
2455 trace_start_gmem_clear(&cmd->trace, cs);
2456
2457 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2458 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2459 clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
2460 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2461 clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
2462 return;
2463 }
2464
2465 clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
2466
2467 trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
2468 }
2469
2470 static void
2471 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2472 uint32_t attachment_count,
2473 const VkClearAttachment *attachments,
2474 uint32_t rect_count,
2475 const VkClearRect *rects)
2476 {
2477 const struct tu_subpass *subpass = cmd->state.subpass;
2478 struct tu_cs *cs = &cmd->draw_cs;
2479
2480 /* TODO: swap the loops for smaller cmdstream */
2481 for (unsigned i = 0; i < rect_count; i++) {
2482 unsigned x1 = rects[i].rect.offset.x;
2483 unsigned y1 = rects[i].rect.offset.y;
2484 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2485 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2486
2487 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2488 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2489 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2490
2491 for (unsigned j = 0; j < attachment_count; j++) {
2492 uint32_t a;
2493 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2494 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2495 else
2496 a = subpass->depth_stencil_attachment.attachment;
2497
2498 if (a == VK_ATTACHMENT_UNUSED)
2499 continue;
2500
2501 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2502 &attachments[j].clearValue);
2503 }
2504 }
2505 }
2506
2507 VKAPI_ATTR void VKAPI_CALL
2508 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2509 uint32_t attachmentCount,
2510 const VkClearAttachment *pAttachments,
2511 uint32_t rectCount,
2512 const VkClearRect *pRects)
2513 {
2514 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2515 struct tu_cs *cs = &cmd->draw_cs;
2516
2517 /* The sysmem path behaves like a draw. Note that we don't have a way of using
2518  * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec.
2519  */
2520 tu_emit_cache_flush_renderpass(cmd, cs);
2521
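   /* Clearing the depth aspect writes depth without updating LRZ, so the LRZ
    * state has to be invalidated.
    */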
2522 for (uint32_t j = 0; j < attachmentCount; j++) {
2523 if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
2524 continue;
2525 cmd->state.lrz.valid = false;
2526 cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2527 }
2528
2529 /* vkCmdClearAttachments is supposed to respect the predicate if active.
2530 * The easiest way to do this is to always use the 3d path, which always
2531 * works even with GMEM because it's just a simple draw using the existing
2532 * attachment state. However it seems that IGNORE_VISIBILITY draws must be
2533 * skipped in the binning pass, since otherwise they produce binning data
2534 * which isn't consumed and leads to the wrong binning data being read, so
2535 * condition on GMEM | SYSMEM.
2536 */
2537 if (cmd->state.predication_active) {
2538 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
2539 CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2540 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2541 tu_cond_exec_end(cs);
2542 return;
2543 }
2544
2545 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2546 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2547 tu_cond_exec_end(cs);
2548
2549 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2550 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2551 tu_cond_exec_end(cs);
2552 }
2553
2554 static void
2555 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2556 struct tu_cs *cs,
2557 VkFormat format,
2558 VkImageAspectFlags clear_mask,
2559 const VkRenderPassBeginInfo *info,
2560 uint32_t a,
2561 bool separate_stencil)
2562 {
2563 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2564 const struct tu_image_view *iview = cmd->state.attachments[a];
2565 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
2566 const struct blit_ops *ops = &r2d_ops;
2567 if (cmd->state.pass->attachments[a].samples > 1)
2568 ops = &r3d_ops;
2569
2570 trace_start_sysmem_clear(&cmd->trace, cs);
2571
2572 ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled,
2573 cmd->state.pass->attachments[a].samples);
2574 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2575 ops->clear_value(cs, format, &info->pClearValues[a]);
2576
2577 for_each_layer(i, clear_views, fb->layers) {
2578 if (separate_stencil) {
2579 if (ops == &r3d_ops)
2580 r3d_dst_stencil(cs, iview, i);
2581 else
2582 r2d_dst_stencil(cs, iview, i);
2583 } else {
2584 ops->dst(cs, iview, i);
2585 }
2586 ops->run(cmd, cs);
2587 }
2588
2589 ops->teardown(cmd, cs);
2590
2591 trace_end_sysmem_clear(&cmd->trace, cs,
2592 format, ops == &r3d_ops,
2593 cmd->state.pass->attachments[a].samples);
2594 }
2595
2596 void
2597 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2598 struct tu_cs *cs,
2599 uint32_t a,
2600 const VkRenderPassBeginInfo *info)
2601 {
2602 const struct tu_render_pass_attachment *attachment =
2603 &cmd->state.pass->attachments[a];
2604
2605 if (!attachment->clear_mask)
2606 return;
2607
2608 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2609 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2610 clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2611 info, a, false);
2612 }
2613 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2614 clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2615 info, a, true);
2616 }
2617 } else {
2618 clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2619 info, a, false);
2620 }
2621
2622 /* The spec doesn't explicitly say, but presumably the initial renderpass
2623 * clear is considered part of the renderpass, and therefore barriers
2624 * aren't required inside the subpass/renderpass. Therefore we need to
2625 * flush CCU color into CCU depth here, just like with
2626 * vkCmdClearAttachments(). Note that because this only happens at the
2627 * beginning of a renderpass, and renderpass writes are considered
2628 * "incoherent", we shouldn't have to worry about syncing depth into color
2629 * beforehand as depth should already be flushed.
2630 */
2631 if (vk_format_is_depth_or_stencil(attachment->format)) {
2632 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2633 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2634 } else {
2635 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2636 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2637 }
2638
2639 if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
2640 tu_cs_emit_wfi(cs);
2641 }
2642
2643 void
2644 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2645 struct tu_cs *cs,
2646 uint32_t a,
2647 const VkRenderPassBeginInfo *info)
2648 {
2649 const struct tu_render_pass_attachment *attachment =
2650 &cmd->state.pass->attachments[a];
2651
2652 if (!attachment->clear_mask)
2653 return;
2654
2655 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2656
2657 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2658 &info->pClearValues[a]);
2659 }
2660
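/* Emit a GMEM blit using the CP_EVENT_WRITE::BLIT event. This is used both
 * for loading an attachment into GMEM (resolve == false) and for
 * storing/resolving it back out to the image (resolve == true).
 */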
2661 static void
2662 tu_emit_blit(struct tu_cmd_buffer *cmd,
2663 struct tu_cs *cs,
2664 const struct tu_image_view *iview,
2665 const struct tu_render_pass_attachment *attachment,
2666 bool resolve,
2667 bool separate_stencil)
2668 {
2669 tu_cs_emit_regs(cs,
2670 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2671
2672 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2673 .unk0 = !resolve,
2674 .gmem = !resolve,
2675 .sample_0 = vk_format_is_int(attachment->format) |
2676 vk_format_is_depth_or_stencil(attachment->format)));
2677
2678 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2679 if (separate_stencil) {
2680 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
2681 tu_cs_emit_qw(cs, iview->stencil_base_addr);
2682 tu_cs_emit(cs, iview->stencil_PITCH);
2683
2684 tu_cs_emit_regs(cs,
2685 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
2686 } else {
2687 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2688 tu_cs_image_ref_2d(cs, iview, 0, false);
2689
2690 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
2691 tu_cs_image_flag_ref(cs, iview, 0);
2692
2693 tu_cs_emit_regs(cs,
2694 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2695 }
2696
2697 tu6_emit_event_write(cmd, cs, BLIT);
2698 }
2699
2700 static bool
2701 blit_can_resolve(VkFormat format)
2702 {
2703 const struct util_format_description *desc = vk_format_description(format);
2704
2705 /* blit event can only do resolve for simple cases:
2706 * averaging samples as unsigned integers or choosing only one sample
2707 */
2708 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2709 return false;
2710
2711 /* can't do formats with larger channel sizes
2712 * note: this includes all float formats
2713 * note2: single channel integer formats seem OK
2714 */
2715 if (desc->channel[0].size > 10)
2716 return false;
2717
2718 switch (format) {
2719 /* For unknown reasons the blit event can't MSAA-resolve these formats when tiled,
2720  * likely because these formats have a different layout from other cpp=2 formats.
2721  */
2722 case VK_FORMAT_R8G8_UNORM:
2723 case VK_FORMAT_R8G8_UINT:
2724 case VK_FORMAT_R8G8_SINT:
2725 /* TODO: this one should be able to work? */
2726 case VK_FORMAT_D24_UNORM_S8_UINT:
2727 return false;
2728 default:
2729 break;
2730 }
2731
2732 return true;
2733 }
2734
2735 void
2736 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2737 struct tu_cs *cs,
2738 uint32_t a,
2739 bool force_load)
2740 {
2741 const struct tu_image_view *iview = cmd->state.attachments[a];
2742 const struct tu_render_pass_attachment *attachment =
2743 &cmd->state.pass->attachments[a];
2744
2745 trace_start_gmem_load(&cmd->trace, cs);
2746
2747 if (attachment->load || force_load)
2748 tu_emit_blit(cmd, cs, iview, attachment, false, false);
2749
2750 if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
2751 tu_emit_blit(cmd, cs, iview, attachment, false, true);
2752
2753 trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
2754 }
2755
2756 static void
2757 store_cp_blit(struct tu_cmd_buffer *cmd,
2758 struct tu_cs *cs,
2759 const struct tu_image_view *iview,
2760 uint32_t samples,
2761 bool separate_stencil,
2762 VkFormat format,
2763 uint32_t gmem_offset,
2764 uint32_t cpp)
2765 {
2766 r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
2767 iview->ubwc_enabled, true);
2768 if (separate_stencil)
2769 r2d_dst_stencil(cs, iview, 0);
2770 else
2771 r2d_dst(cs, iview, 0);
2772
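   /* The source is the tile's contents in GMEM, read as a TILE6_2 image at
    * gmem_base + gmem_offset with a pitch of one GMEM tile row.
    */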
2773 tu_cs_emit_regs(cs,
2774 A6XX_SP_PS_2D_SRC_INFO(
2775 .color_format = tu6_format_texture(format, TILE6_2).fmt,
2776 .tile_mode = TILE6_2,
2777 .srgb = vk_format_is_srgb(format),
2778 .samples = tu_msaa_samples(samples),
2779 .samples_average = !vk_format_is_int(format) &&
2780 !vk_format_is_depth_or_stencil(format),
2781 .unk20 = 1,
2782 .unk22 = 1),
2783 /* note: src size does not matter when not scaling */
2784 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2785 A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
2786 A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
2787
2788 /* sync GMEM writes with CACHE. */
2789 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2790
2791 /* Wait for CACHE_INVALIDATE to land */
2792 tu_cs_emit_wfi(cs);
2793
2794 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2795 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2796
2797 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2798 * sysmem, and we generally assume that GMEM renderpasses leave their
2799 * results in sysmem, so we need to flush manually here.
2800 */
2801 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2802 }
2803
2804 static void
2805 store_3d_blit(struct tu_cmd_buffer *cmd,
2806 struct tu_cs *cs,
2807 const struct tu_image_view *iview,
2808 uint32_t dst_samples,
2809 bool separate_stencil,
2810 VkFormat format,
2811 const VkRect2D *render_area,
2812 uint32_t gmem_offset,
2813 uint32_t cpp)
2814 {
2815 r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
2816 iview->ubwc_enabled, dst_samples);
2817
2818 r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2819
2820 if (separate_stencil)
2821 r3d_dst_stencil(cs, iview, 0);
2822 else
2823 r3d_dst(cs, iview, 0);
2824
2825 r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp);
2826
2827 /* sync GMEM writes with CACHE. */
2828 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2829
2830 r3d_run(cmd, cs);
2831
2832 /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2833 * sysmem, and we generally assume that GMEM renderpasses leave their
2834 * results in sysmem, so we need to flush manually here. The 3d blit path
2835 * writes to depth images as a color RT, so there's no need to flush depth.
2836 */
2837 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2838 }
2839
2840 void
2841 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2842 struct tu_cs *cs,
2843 uint32_t a,
2844 uint32_t gmem_a)
2845 {
2846 struct tu_physical_device *phys_dev = cmd->device->physical_device;
2847 const VkRect2D *render_area = &cmd->state.render_area;
2848 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2849 const struct tu_image_view *iview = cmd->state.attachments[a];
2850 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2851
2852 if (!dst->store && !dst->store_stencil)
2853 return;
2854
2855 uint32_t x1 = render_area->offset.x;
2856 uint32_t y1 = render_area->offset.y;
2857 uint32_t x2 = x1 + render_area->extent.width;
2858 uint32_t y2 = y1 + render_area->extent.height;
2859 /* x2/y2 can be unaligned if equal to the size of the image, since the store
2860  * will just write into padding space. The one exception is linear levels,
2861  * which don't have the required y padding in the layout (except for the
2862  * last level).
2863  */
2864 bool need_y2_align =
2865 y2 != iview->extent.height || iview->need_y2_align;
2866
2867 bool unaligned =
2868 x1 % phys_dev->info->gmem_align_w ||
2869 (x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) ||
2870 y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);
2871
2872 /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes, one for
2873  * depth and the other for stencil. When resolving MSAA D32_SFLOAT_S8_UINT
2874  * to S8_UINT, we need to take that into account.
2875  */
2876 bool resolve_d32s8_s8 =
2877 src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
2878 dst->format == VK_FORMAT_S8_UINT;
2879
2880 trace_start_gmem_store(&cmd->trace, cs);
2881
2882 /* use fast path when render area is aligned, except for unsupported resolve cases */
2883 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2884 if (dst->store)
2885 tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8);
2886 if (dst->store_stencil)
2887 tu_emit_blit(cmd, cs, iview, src, true, true);
2888
2889 trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
2890 return;
2891 }
2892
2893 VkFormat format = src->format;
2894 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
2895 format = VK_FORMAT_D32_SFLOAT;
2896
2897 if (dst->samples > 1) {
2898 /* If we hit this path, we have to disable draw states after every tile
2899 * instead of once at the end of the renderpass, so that they aren't
2900 * executed when calling CP_DRAW.
2901 *
2902 * TODO: store a flag somewhere so we don't do this more than once and
2903 * don't do it after the renderpass when this happens.
2904 */
2905 if (dst->store || dst->store_stencil)
2906 tu_disable_draw_states(cmd, cs);
2907
2908 if (dst->store) {
2909 store_3d_blit(cmd, cs, iview, dst->samples, resolve_d32s8_s8, format,
2910 render_area, src->gmem_offset, src->cpp);
2911 }
2912 if (dst->store_stencil) {
2913 store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT,
2914 render_area, src->gmem_offset, src->samples);
2915 }
2916 } else {
2917 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2918
2919 if (dst->store) {
2920 store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format,
2921 src->gmem_offset, src->cpp);
2922 }
2923 if (dst->store_stencil) {
2924 store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
2925 src->gmem_offset_stencil, src->samples);
2926 }
2927 }
2928
2929 trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
2930 }
2931