1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "common/freedreno_guardband.h"
29 #include "tu_private.h"
30
31 #include "ir3/ir3_nir.h"
32 #include "main/menums.h"
33 #include "nir/nir.h"
34 #include "nir/nir_builder.h"
35 #include "spirv/nir_spirv.h"
36 #include "util/debug.h"
37 #include "util/mesa-sha1.h"
38 #include "util/u_atomic.h"
39 #include "vk_format.h"
40 #include "vk_util.h"
41
42 #include "tu_cs.h"
43
44 /* Emit IB that preloads the descriptors that the shader uses */
45
46 static void
47 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
48 enum a6xx_state_block sb, unsigned base, unsigned offset,
49 unsigned count)
50 {
51 /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
52 * clear if emitting more packets will even help anything. Presumably the
53 * descriptor cache is relatively small, and these packets stop doing
54 * anything when there are too many descriptors.
55 */
56 tu_cs_emit_pkt7(cs, opcode, 3);
57 tu_cs_emit(cs,
58 CP_LOAD_STATE6_0_STATE_TYPE(st) |
59 CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
60 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
61 CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
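/* With SS6_BINDLESS the EXT_SRC_ADDR dwords are not a real pointer: the low
 * bits hold the dword offset within the descriptor set and bits 28+ select
 * which BINDLESS_BASE register the offset is relative to.
 */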
62 tu_cs_emit_qw(cs, offset | (base << 28));
63 }
64
65 static unsigned
66 tu6_load_state_size(struct tu_pipeline *pipeline, bool compute)
67 {
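/* Each emit_load_state() call emits 4 dwords: one pkt7 header plus three
 * payload dwords (CP_LOAD_STATE6_0 and the 64-bit EXT_SRC_ADDR).
 */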
68 const unsigned load_state_size = 4;
69 unsigned size = 0;
70 for (unsigned i = 0; i < pipeline->layout->num_sets; i++) {
71 if (!(pipeline->active_desc_sets & (1u << i)))
72 continue;
73
74 struct tu_descriptor_set_layout *set_layout = pipeline->layout->set[i].layout;
75 for (unsigned j = 0; j < set_layout->binding_count; j++) {
76 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
77 unsigned count = 0;
78 /* Note: some users, like amber for example, pass in
79 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
80 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
81 */
82 VkShaderStageFlags stages = compute ?
83 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
84 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
85 unsigned stage_count = util_bitcount(stages);
86
87 if (!binding->array_size)
88 continue;
89
90 switch (binding->type) {
91 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
92 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
93 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
94 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
95 /* IBO-backed resources only need one packet for all graphics stages */
96 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
97 count += 1;
98 if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
99 count += 1;
100 break;
101 case VK_DESCRIPTOR_TYPE_SAMPLER:
102 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
103 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
104 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
105 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
106 * Textures and UBOs need a packet for each stage
107 count = stage_count;
108 break;
109 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
110 /* Because of how we pack combined images and samplers, we
111 * currently can't use one packet for the whole array.
112 */
113 count = stage_count * binding->array_size * 2;
114 break;
115 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
116 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
117 break;
118 default:
119 unreachable("bad descriptor type");
120 }
121 size += count * load_state_size;
122 }
123 }
124 return size;
125 }
126
127 static void
128 tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
129 {
130 unsigned size = tu6_load_state_size(pipeline, compute);
131 if (size == 0)
132 return;
133
134 struct tu_cs cs;
135 tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
136
137 struct tu_pipeline_layout *layout = pipeline->layout;
138 for (unsigned i = 0; i < layout->num_sets; i++) {
139 /* From 13.2.7. Descriptor Set Binding:
140 *
141 * A compatible descriptor set must be bound for all set numbers that
142 * any shaders in a pipeline access, at the time that a draw or
143 * dispatch command is recorded to execute using that pipeline.
144 * However, if none of the shaders in a pipeline statically use any
145 * bindings with a particular set number, then no descriptor set need
146 * be bound for that set number, even if the pipeline layout includes
147 * a non-trivial descriptor set layout for that set number.
148 *
149 * This means that descriptor sets unused by the pipeline may have a
150 * garbage or 0 BINDLESS_BASE register, which will cause context faults
151 * when prefetching descriptors from these sets. Skip prefetching for
152 * descriptors from them to avoid this. This is also an optimization,
153 * since these prefetches would be useless.
154 */
155 if (!(pipeline->active_desc_sets & (1u << i)))
156 continue;
157
158 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
159 for (unsigned j = 0; j < set_layout->binding_count; j++) {
160 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
161 unsigned base = i;
162 unsigned offset = binding->offset / 4;
163 /* Note: some users, like amber for example, pass in
164 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
165 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
166 */
167 VkShaderStageFlags stages = compute ?
168 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
169 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
170 unsigned count = binding->array_size;
171 if (count == 0 || stages == 0)
172 continue;
173 switch (binding->type) {
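/* Dynamic UBO/SSBO descriptors live in a separate region bound through the
 * BINDLESS_BASE slot at index MAX_SETS, so the dynamic cases override
 * base/offset before falling through to the non-dynamic handling.
 */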
174 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
175 base = MAX_SETS;
176 offset = (layout->set[i].dynamic_offset_start +
177 binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
178 FALLTHROUGH;
179 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
180 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
181 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
182 /* IBO-backed resources only need one packet for all graphics stages */
183 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
184 emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
185 base, offset, count);
186 }
187 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
188 emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
189 base, offset, count);
190 }
191 break;
192 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
193 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
194 /* nothing - input attachment doesn't use bindless */
195 break;
196 case VK_DESCRIPTOR_TYPE_SAMPLER:
197 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
198 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
199 tu_foreach_stage(stage, stages) {
200 emit_load_state(&cs, tu6_stage2opcode(stage),
201 binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
202 ST6_SHADER : ST6_CONSTANTS,
203 tu6_stage2texsb(stage), base, offset, count);
204 }
205 break;
206 }
207 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
208 base = MAX_SETS;
209 offset = (layout->set[i].dynamic_offset_start +
210 binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
211 FALLTHROUGH;
212 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
213 tu_foreach_stage(stage, stages) {
214 emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
215 tu6_stage2shadersb(stage), base, offset, count);
216 }
217 break;
218 }
219 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
220 tu_foreach_stage(stage, stages) {
221 /* TODO: We could emit less CP_LOAD_STATE6 if we used
222 * struct-of-arrays instead of array-of-structs.
223 */
224 for (unsigned i = 0; i < count; i++) {
225 unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
226 unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
227 emit_load_state(&cs, tu6_stage2opcode(stage),
228 ST6_CONSTANTS, tu6_stage2texsb(stage),
229 base, tex_offset, 1);
230 emit_load_state(&cs, tu6_stage2opcode(stage),
231 ST6_SHADER, tu6_stage2texsb(stage),
232 base, sam_offset, 1);
233 }
234 }
235 break;
236 }
237 default:
238 unreachable("bad descriptor type");
239 }
240 }
241 }
242
243 pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
244 }
245
246 struct tu_pipeline_builder
247 {
248 struct tu_device *device;
249 struct tu_pipeline_cache *cache;
250 struct tu_pipeline_layout *layout;
251 const VkAllocationCallbacks *alloc;
252 const VkGraphicsPipelineCreateInfo *create_info;
253
254 struct tu_shader *shaders[MESA_SHADER_FRAGMENT + 1];
255 struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1];
256 struct ir3_shader_variant *binning_variant;
257 uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1];
258 uint64_t binning_vs_iova;
259
260 uint32_t additional_cs_reserve_size;
261
262 struct tu_pvtmem_config pvtmem;
263
264 bool rasterizer_discard;
265 /* these states are affected by rasterizer_discard */
266 bool emit_msaa_state;
267 VkSampleCountFlagBits samples;
268 bool use_color_attachments;
269 bool use_dual_src_blend;
270 bool alpha_to_coverage;
271 uint32_t color_attachment_count;
272 VkFormat color_attachment_formats[MAX_RTS];
273 VkFormat depth_attachment_format;
274 uint32_t render_components;
275 uint32_t multiview_mask;
276 };
277
278 static bool
279 tu_logic_op_reads_dst(VkLogicOp op)
280 {
281 switch (op) {
282 case VK_LOGIC_OP_CLEAR:
283 case VK_LOGIC_OP_COPY:
284 case VK_LOGIC_OP_COPY_INVERTED:
285 case VK_LOGIC_OP_SET:
286 return false;
287 default:
288 return true;
289 }
290 }
291
292 static VkBlendFactor
293 tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
294 {
295 /* treat dst alpha as 1.0 and avoid reading it */
296 switch (factor) {
297 case VK_BLEND_FACTOR_DST_ALPHA:
298 return VK_BLEND_FACTOR_ONE;
299 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
300 return VK_BLEND_FACTOR_ZERO;
301 default:
302 return factor;
303 }
304 }
305
306 static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
307 {
308 switch (factor) {
309 case VK_BLEND_FACTOR_SRC1_COLOR:
310 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
311 case VK_BLEND_FACTOR_SRC1_ALPHA:
312 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
313 return true;
314 default:
315 return false;
316 }
317 }
318
319 static bool
320 tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
321 {
322 if (!info)
323 return false;
324
325 for (unsigned i = 0; i < info->attachmentCount; i++) {
326 const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
327 if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
328 tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
329 tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
330 tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
331 return true;
332 }
333
334 return false;
335 }
336
337 static const struct xs_config {
338 uint16_t reg_sp_xs_ctrl;
339 uint16_t reg_sp_xs_config;
340 uint16_t reg_sp_xs_instrlen;
341 uint16_t reg_hlsq_xs_ctrl;
342 uint16_t reg_sp_xs_first_exec_offset;
343 uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
344 } xs_config[] = {
345 [MESA_SHADER_VERTEX] = {
346 REG_A6XX_SP_VS_CTRL_REG0,
347 REG_A6XX_SP_VS_CONFIG,
348 REG_A6XX_SP_VS_INSTRLEN,
349 REG_A6XX_HLSQ_VS_CNTL,
350 REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
351 REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
352 },
353 [MESA_SHADER_TESS_CTRL] = {
354 REG_A6XX_SP_HS_CTRL_REG0,
355 REG_A6XX_SP_HS_CONFIG,
356 REG_A6XX_SP_HS_INSTRLEN,
357 REG_A6XX_HLSQ_HS_CNTL,
358 REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
359 REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
360 },
361 [MESA_SHADER_TESS_EVAL] = {
362 REG_A6XX_SP_DS_CTRL_REG0,
363 REG_A6XX_SP_DS_CONFIG,
364 REG_A6XX_SP_DS_INSTRLEN,
365 REG_A6XX_HLSQ_DS_CNTL,
366 REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
367 REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
368 },
369 [MESA_SHADER_GEOMETRY] = {
370 REG_A6XX_SP_GS_CTRL_REG0,
371 REG_A6XX_SP_GS_CONFIG,
372 REG_A6XX_SP_GS_INSTRLEN,
373 REG_A6XX_HLSQ_GS_CNTL,
374 REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
375 REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
376 },
377 [MESA_SHADER_FRAGMENT] = {
378 REG_A6XX_SP_FS_CTRL_REG0,
379 REG_A6XX_SP_FS_CONFIG,
380 REG_A6XX_SP_FS_INSTRLEN,
381 REG_A6XX_HLSQ_FS_CNTL,
382 REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
383 REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
384 },
385 [MESA_SHADER_COMPUTE] = {
386 REG_A6XX_SP_CS_CTRL_REG0,
387 REG_A6XX_SP_CS_CONFIG,
388 REG_A6XX_SP_CS_INSTRLEN,
389 REG_A6XX_HLSQ_CS_CNTL,
390 REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
391 REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
392 },
393 };
394
395 static uint32_t
396 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
397 {
398 const struct ir3_const_state *const_state = ir3_const_state(xs);
399 uint32_t base = const_state->offsets.immediate;
400 int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
401
402 /* truncate size to avoid writing constants that the shader
403 * does not use:
404 */
405 size = MIN2(size + base, xs->constlen) - base;
406
407 return MAX2(size, 0) * 4;
408 }
409
410 /* We allocate fixed-length substreams for shader state, however some
411 * parts of the state may have unbounded length. Their additional space
412 * requirements should be calculated here.
413 */
414 static uint32_t
415 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
416 {
417 uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
418 return size;
419 }
420
421 void
422 tu6_emit_xs_config(struct tu_cs *cs,
423 gl_shader_stage stage, /* xs->type, but xs may be NULL */
424 const struct ir3_shader_variant *xs)
425 {
426 const struct xs_config *cfg = &xs_config[stage];
427
428 if (!xs) {
429 /* shader stage disabled */
430 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
431 tu_cs_emit(cs, 0);
432
433 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
434 tu_cs_emit(cs, 0);
435 return;
436 }
437
438 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
439 tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
440 COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
441 COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
442 COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
443 COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
444 A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
445 A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
446
447 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
448 tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
449 A6XX_HLSQ_VS_CNTL_ENABLED);
450 }
451
452 void
453 tu6_emit_xs(struct tu_cs *cs,
454 gl_shader_stage stage, /* xs->type, but xs may be NULL */
455 const struct ir3_shader_variant *xs,
456 const struct tu_pvtmem_config *pvtmem,
457 uint64_t binary_iova)
458 {
459 const struct xs_config *cfg = &xs_config[stage];
460
461 if (!xs) {
462 /* shader stage disabled */
463 return;
464 }
465
466 enum a6xx_threadsize thrsz =
467 xs->info.double_threadsize ? THREAD128 : THREAD64;
468 switch (stage) {
469 case MESA_SHADER_VERTEX:
470 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
471 .fullregfootprint = xs->info.max_reg + 1,
472 .halfregfootprint = xs->info.max_half_reg + 1,
473 .branchstack = ir3_shader_branchstack_hw(xs),
474 .mergedregs = xs->mergedregs,
475 ));
476 break;
477 case MESA_SHADER_TESS_CTRL:
478 tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
479 .fullregfootprint = xs->info.max_reg + 1,
480 .halfregfootprint = xs->info.max_half_reg + 1,
481 .branchstack = ir3_shader_branchstack_hw(xs),
482 ));
483 break;
484 case MESA_SHADER_TESS_EVAL:
485 tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
486 .fullregfootprint = xs->info.max_reg + 1,
487 .halfregfootprint = xs->info.max_half_reg + 1,
488 .branchstack = ir3_shader_branchstack_hw(xs),
489 .mergedregs = xs->mergedregs,
490 ));
491 break;
492 case MESA_SHADER_GEOMETRY:
493 tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
494 .fullregfootprint = xs->info.max_reg + 1,
495 .halfregfootprint = xs->info.max_half_reg + 1,
496 .branchstack = ir3_shader_branchstack_hw(xs),
497 ));
498 break;
499 case MESA_SHADER_FRAGMENT:
500 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
501 .fullregfootprint = xs->info.max_reg + 1,
502 .halfregfootprint = xs->info.max_half_reg + 1,
503 .branchstack = ir3_shader_branchstack_hw(xs),
504 .mergedregs = xs->mergedregs,
505 .threadsize = thrsz,
506 .pixlodenable = xs->need_pixlod,
507 .diff_fine = xs->need_fine_derivatives,
508 .varying = xs->total_in != 0,
509 /* unknown bit, seems unnecessary */
510 .unk24 = true,
511 ));
512 break;
513 case MESA_SHADER_COMPUTE:
514 tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
515 .fullregfootprint = xs->info.max_reg + 1,
516 .halfregfootprint = xs->info.max_half_reg + 1,
517 .branchstack = ir3_shader_branchstack_hw(xs),
518 .mergedregs = xs->mergedregs,
519 .threadsize = thrsz,
520 ));
521 break;
522 default:
523 unreachable("bad shader stage");
524 }
525
526 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
527 tu_cs_emit(cs, xs->instrlen);
528
529 /* emit program binary & private memory layout
530 * binary_iova should be aligned to 1 instrlen unit (128 bytes)
531 */
532
533 assert((binary_iova & 0x7f) == 0);
534 assert((pvtmem->iova & 0x1f) == 0);
535
536 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
537 tu_cs_emit(cs, 0);
538 tu_cs_emit_qw(cs, binary_iova);
539 tu_cs_emit(cs,
540 A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
541 tu_cs_emit_qw(cs, pvtmem->iova);
542 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
543 COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
544
545 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
546 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
547
548 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
549 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
550 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
551 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
552 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
553 CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen));
554 tu_cs_emit_qw(cs, binary_iova);
555
556 /* emit immediates */
557
558 const struct ir3_const_state *const_state = ir3_const_state(xs);
559 uint32_t base = const_state->offsets.immediate;
560 unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
561
562 if (immediate_size > 0) {
563 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
564 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
565 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
566 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
567 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
568 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
569 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
570 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
571
572 tu_cs_emit_array(cs, const_state->immediates, immediate_size);
573 }
574
575 if (const_state->constant_data_ubo != -1) {
576 uint64_t iova = binary_iova + xs->info.constant_data_offset;
577
578 /* Upload UBO state for the constant data. */
579 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
580 tu_cs_emit(cs,
581 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
582 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
583 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
584 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
585 CP_LOAD_STATE6_0_NUM_UNIT(1));
586 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
587 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
588 int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
589 tu_cs_emit_qw(cs,
590 iova |
591 (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
592
593 /* Upload the constant data to the const file if needed. */
594 const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
595
596 for (int i = 0; i < ubo_state->num_enabled; i++) {
597 if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
598 ubo_state->range[i].ubo.bindless) {
599 continue;
600 }
601
602 uint32_t start = ubo_state->range[i].start;
603 uint32_t end = ubo_state->range[i].end;
604 uint32_t size = MIN2(end - start,
605 (16 * xs->constlen) - ubo_state->range[i].offset);
606
607 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
608 tu_cs_emit(cs,
609 CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
610 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
611 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
612 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
613 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
614 tu_cs_emit_qw(cs, iova + start);
615 }
616 }
617 }
618
619 static void
620 tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
621 const struct ir3_shader_variant *v,
622 const struct tu_pvtmem_config *pvtmem,
623 uint64_t binary_iova)
624 {
625 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
626 .cs_state = true,
627 .cs_ibo = true));
628
629 tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
630 tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
631
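/* The SHARED_SIZE field is a coarse encoding of the shader's shared memory
 * requirement (roughly 1 KiB granularity), clamped to a minimum of 1.
 */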
632 uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
633 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
634 tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
635 A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
636
637 if (cs->device->physical_device->info->a6xx.has_lpac) {
638 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
639 tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
640 A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
641 }
642
643 uint32_t local_invocation_id =
644 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
645 uint32_t work_group_id =
646 ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
647
648 enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
649 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
650 tu_cs_emit(cs,
651 A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
652 A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
653 A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
654 A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
655 tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
656 A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
657
658 if (cs->device->physical_device->info->a6xx.has_lpac) {
659 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
660 tu_cs_emit(cs,
661 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
662 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
663 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
664 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
665 tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
666 A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
667 }
668 }
669
670 static void
671 tu6_emit_vs_system_values(struct tu_cs *cs,
672 const struct ir3_shader_variant *vs,
673 const struct ir3_shader_variant *hs,
674 const struct ir3_shader_variant *ds,
675 const struct ir3_shader_variant *gs,
676 bool primid_passthru)
677 {
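/* regid(63, 0) (r63.x) is the "invalid register" sentinel, used whenever a
 * stage does not read the corresponding system value.
 */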
678 const uint32_t vertexid_regid =
679 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
680 const uint32_t instanceid_regid =
681 ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
682 const uint32_t tess_coord_x_regid = hs ?
683 ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
684 regid(63, 0);
685 const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
686 tess_coord_x_regid + 1 :
687 regid(63, 0);
688 const uint32_t hs_rel_patch_regid = hs ?
689 ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
690 regid(63, 0);
691 const uint32_t ds_rel_patch_regid = hs ?
692 ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
693 regid(63, 0);
694 const uint32_t hs_invocation_regid = hs ?
695 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
696 regid(63, 0);
697 const uint32_t gs_primitiveid_regid = gs ?
698 ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
699 regid(63, 0);
700 const uint32_t vs_primitiveid_regid = hs ?
701 ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
702 gs_primitiveid_regid;
703 const uint32_t ds_primitiveid_regid = ds ?
704 ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
705 regid(63, 0);
706 const uint32_t gsheader_regid = gs ?
707 ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
708 regid(63, 0);
709
710 /* Note: we currently don't support multiview with tess or GS. If we did,
711 * and the HW actually works, then we'd have to somehow share this across
712 * stages. Note that the blob doesn't support this either.
713 */
714 const uint32_t viewid_regid =
715 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
716
717 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
718 tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
719 A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
720 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
721 A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
722 tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
723 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
724 tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
725 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
726 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
727 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
728 tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
729 tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
730 0xfc00); /* VFD_CONTROL_5 */
731 tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
732 }
733
734 static void
735 tu6_setup_streamout(struct tu_cs *cs,
736 const struct ir3_shader_variant *v,
737 struct ir3_shader_linkage *l)
738 {
739 const struct ir3_stream_output_info *info = &v->shader->stream_output;
740 /* Note: 64 here comes from the HW layout of the program RAM. The program
741 * for stream N is at DWORD 64 * N.
742 */
743 #define A6XX_SO_PROG_DWORDS 64
744 uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
745 BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
746 uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {};
747
748 /* TODO: streamout state should be in a non-GMEM draw state */
749
750 /* no streamout: */
751 if (info->num_outputs == 0) {
752 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
753 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
754 tu_cs_emit(cs, 0);
755 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
756 tu_cs_emit(cs, 0);
757 return;
758 }
759
760 /* is there something to do with info->stride[i]? */
761
762 for (unsigned i = 0; i < info->num_outputs; i++) {
763 const struct ir3_stream_output *out = &info->output[i];
764 unsigned k = out->register_index;
765 unsigned idx;
766
767 /* Skip it, if it's an output that was never assigned a register. */
768 if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
769 continue;
770
771 ncomp[out->output_buffer] += out->num_components;
772
773 /* the linkage map is sorted in the order the frag shader wants things, so
774 * this is a bit less ideal here..
775 */
776 for (idx = 0; idx < l->cnt; idx++)
777 if (l->var[idx].regid == v->outputs[k].regid)
778 break;
779
780 debug_assert(idx < l->cnt);
781
782 for (unsigned j = 0; j < out->num_components; j++) {
783 unsigned c = j + out->start_component;
784 unsigned loc = l->var[idx].loc + c;
785 unsigned off = j + out->dst_offset; /* in dwords */
786
787 assert(loc < A6XX_SO_PROG_DWORDS * 2);
788 unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
789 if (loc & 1) {
790 prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
791 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
792 A6XX_VPC_SO_PROG_B_OFF(off * 4);
793 } else {
794 prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
795 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
796 A6XX_VPC_SO_PROG_A_OFF(off * 4);
797 }
798 BITSET_SET(valid_dwords, dword);
799 }
800 }
801
802 unsigned prog_count = 0;
803 unsigned start, end;
804 BITSET_FOREACH_RANGE(start, end, valid_dwords,
805 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
806 prog_count += end - start + 1;
807 }
808
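/* CP_CONTEXT_REG_BUNCH takes (register, value) pairs: one pair for
 * VPC_SO_STREAM_CNTL plus four NCOMP pairs make up the fixed 10 dwords, and
 * each range costs one VPC_SO_CNTL pair plus one VPC_SO_PROG pair per
 * program dword (2 * prog_count in total).
 */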
809 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
810 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
811 tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
812 COND(ncomp[0] > 0,
813 A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
814 COND(ncomp[1] > 0,
815 A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
816 COND(ncomp[2] > 0,
817 A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
818 COND(ncomp[3] > 0,
819 A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
820 for (uint32_t i = 0; i < 4; i++) {
821 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i));
822 tu_cs_emit(cs, ncomp[i]);
823 }
824 bool first = true;
825 BITSET_FOREACH_RANGE(start, end, valid_dwords,
826 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
827 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
828 tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
829 A6XX_VPC_SO_CNTL_ADDR(start));
830 for (unsigned i = start; i < end; i++) {
831 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
832 tu_cs_emit(cs, prog[i]);
833 }
834 first = false;
835 }
836 }
837
838 static void
839 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
840 enum a6xx_state_block block, uint32_t offset,
841 uint32_t size, const uint32_t *dwords) {
842 assert(size % 4 == 0);
843
844 tu_cs_emit_pkt7(cs, opcode, 3 + size);
845 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
846 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
847 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
848 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
849 CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
850
851 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
852 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
853 dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
854
855 tu_cs_emit_array(cs, dwords, size);
856 }
857
858 static void
859 tu6_emit_link_map(struct tu_cs *cs,
860 const struct ir3_shader_variant *producer,
861 const struct ir3_shader_variant *consumer,
862 enum a6xx_state_block sb)
863 {
864 const struct ir3_const_state *const_state = ir3_const_state(consumer);
865 uint32_t base = const_state->offsets.primitive_map;
866 int size = DIV_ROUND_UP(consumer->input_size, 4);
867
868 size = (MIN2(size + base, consumer->constlen) - base) * 4;
869 if (size <= 0)
870 return;
871
872 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
873 producer->output_loc);
874 }
875
876 static uint16_t
877 gl_primitive_to_tess(uint16_t primitive) {
878 switch (primitive) {
879 case GL_POINTS:
880 return TESS_POINTS;
881 case GL_LINE_STRIP:
882 return TESS_LINES;
883 case GL_TRIANGLE_STRIP:
884 return TESS_CW_TRIS;
885 default:
886 unreachable("");
887 }
888 }
889
890 void
891 tu6_emit_vpc(struct tu_cs *cs,
892 const struct ir3_shader_variant *vs,
893 const struct ir3_shader_variant *hs,
894 const struct ir3_shader_variant *ds,
895 const struct ir3_shader_variant *gs,
896 const struct ir3_shader_variant *fs,
897 uint32_t patch_control_points)
898 {
899 /* note: doesn't compile as static because of the array regs.. */
900 const struct reg_config {
901 uint16_t reg_sp_xs_out_reg;
902 uint16_t reg_sp_xs_vpc_dst_reg;
903 uint16_t reg_vpc_xs_pack;
904 uint16_t reg_vpc_xs_clip_cntl;
905 uint16_t reg_gras_xs_cl_cntl;
906 uint16_t reg_pc_xs_out_cntl;
907 uint16_t reg_sp_xs_primitive_cntl;
908 uint16_t reg_vpc_xs_layer_cntl;
909 uint16_t reg_gras_xs_layer_cntl;
910 } reg_config[] = {
911 [MESA_SHADER_VERTEX] = {
912 REG_A6XX_SP_VS_OUT_REG(0),
913 REG_A6XX_SP_VS_VPC_DST_REG(0),
914 REG_A6XX_VPC_VS_PACK,
915 REG_A6XX_VPC_VS_CLIP_CNTL,
916 REG_A6XX_GRAS_VS_CL_CNTL,
917 REG_A6XX_PC_VS_OUT_CNTL,
918 REG_A6XX_SP_VS_PRIMITIVE_CNTL,
919 REG_A6XX_VPC_VS_LAYER_CNTL,
920 REG_A6XX_GRAS_VS_LAYER_CNTL
921 },
922 [MESA_SHADER_TESS_CTRL] = {
923 0,
924 0,
925 0,
926 0,
927 0,
928 REG_A6XX_PC_HS_OUT_CNTL,
929 0,
930 0,
931 0
932 },
933 [MESA_SHADER_TESS_EVAL] = {
934 REG_A6XX_SP_DS_OUT_REG(0),
935 REG_A6XX_SP_DS_VPC_DST_REG(0),
936 REG_A6XX_VPC_DS_PACK,
937 REG_A6XX_VPC_DS_CLIP_CNTL,
938 REG_A6XX_GRAS_DS_CL_CNTL,
939 REG_A6XX_PC_DS_OUT_CNTL,
940 REG_A6XX_SP_DS_PRIMITIVE_CNTL,
941 REG_A6XX_VPC_DS_LAYER_CNTL,
942 REG_A6XX_GRAS_DS_LAYER_CNTL
943 },
944 [MESA_SHADER_GEOMETRY] = {
945 REG_A6XX_SP_GS_OUT_REG(0),
946 REG_A6XX_SP_GS_VPC_DST_REG(0),
947 REG_A6XX_VPC_GS_PACK,
948 REG_A6XX_VPC_GS_CLIP_CNTL,
949 REG_A6XX_GRAS_GS_CL_CNTL,
950 REG_A6XX_PC_GS_OUT_CNTL,
951 REG_A6XX_SP_GS_PRIMITIVE_CNTL,
952 REG_A6XX_VPC_GS_LAYER_CNTL,
953 REG_A6XX_GRAS_GS_LAYER_CNTL
954 },
955 };
956
957 const struct ir3_shader_variant *last_shader;
958 if (gs) {
959 last_shader = gs;
960 } else if (hs) {
961 last_shader = ds;
962 } else {
963 last_shader = vs;
964 }
965
966 const struct reg_config *cfg = &reg_config[last_shader->type];
967
968 struct ir3_shader_linkage linkage = {
969 .primid_loc = 0xff,
970 .clip0_loc = 0xff,
971 .clip1_loc = 0xff,
972 };
973 if (fs)
974 ir3_link_shaders(&linkage, last_shader, fs, true);
975
976 if (last_shader->shader->stream_output.num_outputs)
977 ir3_link_stream_out(&linkage, last_shader);
978
979 /* We do this after linking shaders in order to know whether PrimID
980 * passthrough needs to be enabled.
981 */
982 bool primid_passthru = linkage.primid_loc != 0xff;
983 tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);
984
985 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
986 tu_cs_emit(cs, ~linkage.varmask[0]);
987 tu_cs_emit(cs, ~linkage.varmask[1]);
988 tu_cs_emit(cs, ~linkage.varmask[2]);
989 tu_cs_emit(cs, ~linkage.varmask[3]);
990
991 /* a6xx finds position/pointsize at the end */
992 const uint32_t pointsize_regid =
993 ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
994 const uint32_t layer_regid =
995 ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
996 const uint32_t view_regid =
997 ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
998 const uint32_t clip0_regid =
999 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
1000 const uint32_t clip1_regid =
1001 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
1002 uint32_t flags_regid = gs ?
1003 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
1004
1005 uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
1006
1007 if (layer_regid != regid(63, 0)) {
1008 layer_loc = linkage.max_loc;
1009 ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
1010 }
1011
1012 if (view_regid != regid(63, 0)) {
1013 view_loc = linkage.max_loc;
1014 ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc);
1015 }
1016
1017 unsigned extra_pos = 0;
1018
1019 for (unsigned i = 0; i < last_shader->outputs_count; i++) {
1020 if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
1021 continue;
1022
1023 if (position_loc == 0xff)
1024 position_loc = linkage.max_loc;
1025
1026 ir3_link_add(&linkage, last_shader->outputs[i].regid,
1027 0xf, position_loc + 4 * last_shader->outputs[i].view);
1028 extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
1029 }
1030
1031 if (pointsize_regid != regid(63, 0)) {
1032 pointsize_loc = linkage.max_loc;
1033 ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
1034 }
1035
1036 uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
1037
1038 /* Handle the case where clip/cull distances aren't read by the FS */
1039 uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
1040 if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
1041 clip0_loc = linkage.max_loc;
1042 ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
1043 }
1044 if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
1045 clip1_loc = linkage.max_loc;
1046 ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
1047 }
1048
1049 tu6_setup_streamout(cs, last_shader, &linkage);
1050
1051 /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
1052 * at least when a DS is the last stage, so add a dummy output to keep it
1053 * happy if there aren't any. We do this late in order to avoid emitting
1054 * any unused code and make sure that optimizations don't remove it.
1055 */
1056 if (linkage.cnt == 0)
1057 ir3_link_add(&linkage, 0, 0x1, linkage.max_loc);
1058
1059 /* map outputs of the last shader to VPC */
1060 assert(linkage.cnt <= 32);
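/* Each SP_xS_OUT_REG packs two outputs (16 bits each) and each
 * SP_xS_VPC_DST_REG packs four output locations (8 bits each), hence the
 * uint16_t/uint8_t views of the arrays below.
 */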
1061 const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
1062 const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
1063 uint32_t sp_out[16] = {0};
1064 uint32_t sp_vpc_dst[8] = {0};
1065 for (uint32_t i = 0; i < linkage.cnt; i++) {
1066 ((uint16_t *) sp_out)[i] =
1067 A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
1068 A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
1069 ((uint8_t *) sp_vpc_dst)[i] =
1070 A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
1071 }
1072
1073 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
1074 tu_cs_emit_array(cs, sp_out, sp_out_count);
1075
1076 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
1077 tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
1078
1079 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
1080 tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
1081 A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
1082 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
1083 A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
1084
1085 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1086 tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1087 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1088 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1089
1090 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1091 tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1092 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1093
1094 const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1095
1096 for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1097 const struct ir3_shader_variant *shader = geom_shaders[i];
1098 if (!shader)
1099 continue;
1100
1101 bool primid = shader->type != MESA_SHADER_VERTEX &&
1102 VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1103
1104 tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1105 if (shader == last_shader) {
1106 tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1107 CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1108 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1109 CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1110 COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1111 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
1112 } else {
1113 tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1114 }
1115 }
1116
1117 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1118 tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1119 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1120
1121 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1122 tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1123 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));
1124
1125 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1126 tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1127 CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1128
1129 tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));
1130
1131 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1132 tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
1133 COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1134 A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
1135 A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
1136
1137 if (hs) {
1138 shader_info *hs_info = &hs->shader->nir->info;
1139
1140 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1141 tu_cs_emit(cs, hs_info->tess.tcs_vertices_out);
1142
1143 /* Total attribute slots in HS incoming patch. */
1144 tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1145 tu_cs_emit(cs, patch_control_points * vs->output_size / 4);
1146
1147 const uint32_t wavesize = 64;
1148 const uint32_t max_wave_input_size = 64;
1149
1150 /* note: if HS is really just the VS extended, then this
1151 * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
1152 * however that doesn't match the blob, and fails some dEQP tests.
1153 */
1154 uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
1155 uint32_t max_prims_per_wave =
1156 max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
1157 prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);
1158
1159 uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
1160 uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
1161
1162 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1163 tu_cs_emit(cs, wave_input_size);
1164
1165 /* In SPIR-V generated from GLSL, the tessellation primitive params
1166 * are specified in the tess eval shader, but in SPIR-V generated from
1167 * HLSL, they are specified in the tess control shader. */
1168 shader_info *tess_info =
1169 ds->shader->nir->info.tess.spacing == TESS_SPACING_UNSPECIFIED ?
1170 &hs->shader->nir->info : &ds->shader->nir->info;
1171 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
1172 uint32_t output;
1173 if (tess_info->tess.point_mode)
1174 output = TESS_POINTS;
1175 else if (tess_info->tess.primitive_mode == GL_ISOLINES)
1176 output = TESS_LINES;
1177 else if (tess_info->tess.ccw)
1178 output = TESS_CCW_TRIS;
1179 else
1180 output = TESS_CW_TRIS;
1181
1182 enum a6xx_tess_spacing spacing;
1183 switch (tess_info->tess.spacing) {
1184 case TESS_SPACING_EQUAL:
1185 spacing = TESS_EQUAL;
1186 break;
1187 case TESS_SPACING_FRACTIONAL_ODD:
1188 spacing = TESS_FRACTIONAL_ODD;
1189 break;
1190 case TESS_SPACING_FRACTIONAL_EVEN:
1191 spacing = TESS_FRACTIONAL_EVEN;
1192 break;
1193 case TESS_SPACING_UNSPECIFIED:
1194 default:
1195 unreachable("invalid tess spacing");
1196 }
1197 tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
1198 A6XX_PC_TESS_CNTL_OUTPUT(output));
1199
1200 tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1201 tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1202 }
1203
1204
1205 if (gs) {
1206 uint32_t vertices_out, invocations, output, vec4_size;
1207 uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1208
1209 /* this detects the tu_clear_blit path, which doesn't set ->nir */
1210 if (gs->shader->nir) {
1211 if (hs) {
1212 tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1213 } else {
1214 tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1215 }
1216 vertices_out = gs->shader->nir->info.gs.vertices_out - 1;
1217 output = gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive);
1218 invocations = gs->shader->nir->info.gs.invocations - 1;
1219 /* Size of per-primitive allocation in ldlw memory in vec4s. */
1220 vec4_size = gs->shader->nir->info.gs.vertices_in *
1221 DIV_ROUND_UP(prev_stage_output_size, 4);
1222 } else {
1223 vertices_out = 3;
1224 output = TESS_CW_TRIS;
1225 invocations = 0;
1226 vec4_size = 0;
1227 }
1228
1229 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1230 tu_cs_emit(cs,
1231 A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
1232 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
1233 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
1234
1235 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
1236 tu_cs_emit(cs, 0xff);
1237
1238 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1239 tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1240
1241 uint32_t prim_size = prev_stage_output_size;
1242 if (prim_size > 64)
1243 prim_size = 64;
1244 else if (prim_size == 64)
1245 prim_size = 63;
1246 tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1247 tu_cs_emit(cs, prim_size);
1248 }
1249 }
1250
1251 static int
1252 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
1253 uint32_t index,
1254 uint8_t *interp_mode,
1255 uint8_t *ps_repl_mode)
1256 {
1257 enum
1258 {
1259 INTERP_SMOOTH = 0,
1260 INTERP_FLAT = 1,
1261 INTERP_ZERO = 2,
1262 INTERP_ONE = 3,
1263 };
1264 enum
1265 {
1266 PS_REPL_NONE = 0,
1267 PS_REPL_S = 1,
1268 PS_REPL_T = 2,
1269 PS_REPL_ONE_MINUS_T = 3,
1270 };
1271
1272 const uint32_t compmask = fs->inputs[index].compmask;
1273
1274 /* NOTE: varyings are packed, so if compmask is 0xb then the first, second,
1275 * and fourth components occupy three consecutive varying slots
1276 */
1277 int shift = 0;
1278 *interp_mode = 0;
1279 *ps_repl_mode = 0;
1280 if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
1281 if (compmask & 0x1) {
1282 *ps_repl_mode |= PS_REPL_S << shift;
1283 shift += 2;
1284 }
1285 if (compmask & 0x2) {
1286 *ps_repl_mode |= PS_REPL_T << shift;
1287 shift += 2;
1288 }
1289 if (compmask & 0x4) {
1290 *interp_mode |= INTERP_ZERO << shift;
1291 shift += 2;
1292 }
1293 if (compmask & 0x8) {
1294 *interp_mode |= INTERP_ONE << 6;
1295 shift += 2;
1296 }
1297 } else if (fs->inputs[index].flat) {
1298 for (int i = 0; i < 4; i++) {
1299 if (compmask & (1 << i)) {
1300 *interp_mode |= INTERP_FLAT << shift;
1301 shift += 2;
1302 }
1303 }
1304 }
1305
1306 return shift;
1307 }
1308
1309 static void
1310 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
1311 const struct ir3_shader_variant *fs)
1312 {
1313 uint32_t interp_modes[8] = { 0 };
1314 uint32_t ps_repl_modes[8] = { 0 };
1315
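/* Each scalar varying component uses 2 bits in these registers, so a
 * varying's bits start at inloc * 2 and may straddle a 32-bit register
 * boundary, which is handled below by spilling into the next array entry.
 */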
1316 if (fs) {
1317 for (int i = -1;
1318 (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
1319
1320 /* get the mode for input i */
1321 uint8_t interp_mode;
1322 uint8_t ps_repl_mode;
1323 const int bits =
1324 tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
1325
1326 /* OR the mode into the array */
1327 const uint32_t inloc = fs->inputs[i].inloc * 2;
1328 uint32_t n = inloc / 32;
1329 uint32_t shift = inloc % 32;
1330 interp_modes[n] |= interp_mode << shift;
1331 ps_repl_modes[n] |= ps_repl_mode << shift;
1332 if (shift + bits > 32) {
1333 n++;
1334 shift = 32 - shift;
1335
1336 interp_modes[n] |= interp_mode >> shift;
1337 ps_repl_modes[n] |= ps_repl_mode >> shift;
1338 }
1339 }
1340 }
1341
1342 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
1343 tu_cs_emit_array(cs, interp_modes, 8);
1344
1345 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
1346 tu_cs_emit_array(cs, ps_repl_modes, 8);
1347 }
1348
1349 void
1350 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1351 {
1352 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1353 uint32_t ij_regid[IJ_COUNT];
1354 uint32_t smask_in_regid;
1355
1356 bool sample_shading = fs->per_samp | fs->key.sample_shading;
1357 bool enable_varyings = fs->total_in > 0;
1358
1359 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1360 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1361 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1362 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1363 zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1364 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1365 ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1366
1367 if (fs->num_sampler_prefetch > 0) {
1368 assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
1369 /* also, it seems like ij_pix is *required* to be r0.x */
1370 assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1371 }
1372
1373 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1374 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1375 A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
1376 0x7000); // XXX);
1377 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1378 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1379 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
1380 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
1381 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
1382 A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
1383 A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
1384 COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
1385 A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
1386 }
1387
1388 if (fs->num_sampler_prefetch > 0) {
1389 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1390 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1391 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1392 tu_cs_emit(cs,
1393 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1394 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1395 }
1396 }
1397
1398 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
1399 tu_cs_emit(cs, 0x7);
1400 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
1401 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
1402 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
1403 A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE]));
1404 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
1405 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
1406 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
1407 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
1408 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
1409 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
1410 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
1411 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
1412 tu_cs_emit(cs, 0xfcfc);
1413
1414 enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1415 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1416 tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) |
1417 COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS));
1418
1419 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1420 bool need_size_persamp = false;
1421 if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) {
1422 if (sample_shading)
1423 need_size_persamp = true;
1424 else
1425 need_size = true;
1426 }
1427
1428 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1429 tu_cs_emit(cs,
1430 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1431 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1432 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1433 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1434 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1435 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1436 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1437 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1438 COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1439
1440 tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1441 tu_cs_emit(cs,
1442 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1443 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1444 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1445 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1446 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1447 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1448 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1449 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1450 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1451 COND(fs->fragcoord_compmask != 0,
1452 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1453 tu_cs_emit(cs,
1454 A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1455 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1456 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1457 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1458 CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) |
1459 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1460
1461 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1462 tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1463
1464 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1465 tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1466 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1467 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1468
1469 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1470 tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1471 }
1472
1473 static void
1474 tu6_emit_fs_outputs(struct tu_cs *cs,
1475 const struct ir3_shader_variant *fs,
1476 uint32_t mrt_count, bool dual_src_blend,
1477 uint32_t render_components,
1478 bool no_earlyz,
1479 struct tu_pipeline *pipeline)
1480 {
1481 uint32_t smask_regid, posz_regid, stencilref_regid;
1482
1483 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1484 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1485 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1486
1487 uint32_t fragdata_regid[8];
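/* color0_mrt means the shader writes gl_FragColor, which is broadcast to
 * every render target, so all MRT slots share the same output register.
 */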
1488 if (fs->color0_mrt) {
1489 fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
1490 for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
1491 fragdata_regid[i] = fragdata_regid[0];
1492 } else {
1493 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
1494 fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1495 }
1496
1497 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1498 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1499 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1500 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1501 COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1502 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1503
1504 uint32_t fs_render_components = 0;
1505
1506 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
1507 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1508 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1509 (COND(fragdata_regid[i] & HALF_REG_ID,
1510 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1511
1512 if (VALIDREG(fragdata_regid[i])) {
1513 fs_render_components |= 0xf << (i * 4);
1514 }
1515 }
1516
1517 /* dual source blending has an extra fs output in the 2nd slot */
1518 if (dual_src_blend) {
1519 fs_render_components |= 0xf << 4;
1520 }
1521
1522    /* There is no point in having a component enabled which is not written
1523     * by the shader. Per the VK spec this is UB, but a few apps depend on the
1524     * attachment not being changed if the FS doesn't have a corresponding output.
1525     */
1526 fs_render_components &= render_components;
1527
1528 tu_cs_emit_regs(cs,
1529 A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1530
1531 tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
1532 tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1533 COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1534 COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1535 COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1536 tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));
1537
1538 tu_cs_emit_regs(cs,
1539 A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1540
1541 if (pipeline) {
1542 pipeline->lrz.fs_has_kill = fs->has_kill;
1543 pipeline->lrz.early_fragment_tests = fs->shader->nir->info.fs.early_fragment_tests;
1544
1545 if ((fs->shader && !fs->shader->nir->info.fs.early_fragment_tests) &&
1546 (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
1547 pipeline->lrz.force_late_z = true;
1548 }
1549 }
1550 }
1551
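/* Upload the primitive_param driver consts (primitive/vertex strides,
 * control points per patch) consumed by the tessellation and geometry
 * stages.
 */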
1552 static void
1553 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1554 const struct ir3_shader_variant *vs,
1555 const struct ir3_shader_variant *hs,
1556 const struct ir3_shader_variant *ds,
1557 const struct ir3_shader_variant *gs,
1558 uint32_t cps_per_patch)
1559 {
1560 uint32_t num_vertices =
1561 hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;
1562
1563 uint32_t vs_params[4] = {
1564 vs->output_size * num_vertices * 4, /* vs primitive stride */
1565 vs->output_size * 4, /* vs vertex stride */
1566 0,
1567 0,
1568 };
1569 uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
1570 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
1571 ARRAY_SIZE(vs_params), vs_params);
1572
1573 if (hs) {
1574 assert(ds->type != MESA_SHADER_NONE);
1575 uint32_t hs_params[4] = {
1576 vs->output_size * num_vertices * 4, /* hs primitive stride */
1577 vs->output_size * 4, /* hs vertex stride */
1578 hs->output_size,
1579 cps_per_patch,
1580 };
1581
1582 uint32_t hs_base = hs->const_state->offsets.primitive_param;
1583 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
1584 ARRAY_SIZE(hs_params), hs_params);
1585 if (gs)
1586 num_vertices = gs->shader->nir->info.gs.vertices_in;
1587
1588 uint32_t ds_params[4] = {
1589 ds->output_size * num_vertices * 4, /* ds primitive stride */
1590 ds->output_size * 4, /* ds vertex stride */
1591 hs->output_size, /* hs vertex stride (dwords) */
1592 hs->shader->nir->info.tess.tcs_vertices_out
1593 };
1594
1595 uint32_t ds_base = ds->const_state->offsets.primitive_param;
1596 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
1597 ARRAY_SIZE(ds_params), ds_params);
1598 }
1599
1600 if (gs) {
1601 const struct ir3_shader_variant *prev = ds ? ds : vs;
1602 uint32_t gs_params[4] = {
1603 prev->output_size * num_vertices * 4, /* gs primitive stride */
1604 prev->output_size * 4, /* gs vertex stride */
1605 0,
1606 0,
1607 };
1608 uint32_t gs_base = gs->const_state->offsets.primitive_param;
1609 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
1610 ARRAY_SIZE(gs_params), gs_params);
1611 }
1612 }
1613
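/* Invalidate cached shader state and emit the per-stage shader config for
 * all graphics stages.
 */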
1614 static void
1615 tu6_emit_program_config(struct tu_cs *cs,
1616 struct tu_pipeline_builder *builder)
1617 {
1618 gl_shader_stage stage = MESA_SHADER_VERTEX;
1619
1620 STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1621
1622 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
1623 .vs_state = true,
1624 .hs_state = true,
1625 .ds_state = true,
1626 .gs_state = true,
1627 .fs_state = true,
1628 .gfx_ibo = true));
1629 for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
1630 tu6_emit_xs_config(cs, stage, builder->variants[stage]);
1631 }
1632 }
1633
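/* Emit the full program state for the draw or binning pass: per-stage
 * shaders, multiview state, VPC/varying setup and FS input/output state.
 */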
1634 static void
1635 tu6_emit_program(struct tu_cs *cs,
1636 struct tu_pipeline_builder *builder,
1637 bool binning_pass,
1638 struct tu_pipeline *pipeline)
1639 {
1640 const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
1641 const struct ir3_shader_variant *bs = builder->binning_variant;
1642 const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
1643 const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
1644 const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
1645 const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
1646 gl_shader_stage stage = MESA_SHADER_VERTEX;
1647 uint32_t cps_per_patch = builder->create_info->pTessellationState ?
1648 builder->create_info->pTessellationState->patchControlPoints : 0;
1649 bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output;
1650
1651 /* Don't use the binning pass variant when GS is present because we don't
1652 * support compiling correct binning pass variants with GS.
1653 */
1654 if (binning_pass && !gs) {
1655 vs = bs;
1656 tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
1657 stage++;
1658 }
1659
1660 for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
1661 const struct ir3_shader_variant *xs = builder->variants[stage];
1662
1663 if (stage == MESA_SHADER_FRAGMENT && binning_pass)
1664 fs = xs = NULL;
1665
1666 tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
1667 }
1668
1669 uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
1670 uint32_t multiview_cntl = builder->multiview_mask ?
1671 A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1672 A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1673 COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1674 : 0;
1675
1676 /* Copy what the blob does here. This will emit an extra 0x3f
1677 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1678 * this is working around yet.
1679 */
1680 if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
1681 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1682 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1683 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1684 } else {
1685 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1686 }
1687 tu_cs_emit(cs, multiview_cntl);
1688
1689 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1690 tu_cs_emit(cs, multiview_cntl);
1691
1692 if (multiview_cntl &&
1693 builder->device->physical_device->info->a6xx.supports_multiview_mask) {
1694 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1695 tu_cs_emit(cs, builder->multiview_mask);
1696 }
1697
1698 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1699 tu_cs_emit(cs, 0);
1700
1701 tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
1702 tu6_emit_vpc_varying_modes(cs, fs);
1703
1704 bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT;
1705 uint32_t mrt_count = builder->color_attachment_count;
1706 uint32_t render_components = builder->render_components;
1707
1708 if (builder->alpha_to_coverage) {
1709 /* alpha to coverage can behave like a discard */
1710 no_earlyz = true;
1711 /* alpha value comes from first mrt */
1712 render_components |= 0xf;
1713 if (!mrt_count) {
1714 mrt_count = 1;
1715 /* Disable memory write for dummy mrt because it doesn't get set otherwise */
1716 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0));
1717 }
1718 }
1719
1720 if (fs) {
1721 tu6_emit_fs_inputs(cs, fs);
1722 tu6_emit_fs_outputs(cs, fs, mrt_count,
1723 builder->use_dual_src_blend,
1724 render_components,
1725 no_earlyz,
1726 pipeline);
1727 } else {
1728 /* TODO: check if these can be skipped if fs is disabled */
1729 struct ir3_shader_variant dummy_variant = {};
1730 tu6_emit_fs_inputs(cs, &dummy_variant);
1731 tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count,
1732 builder->use_dual_src_blend,
1733 render_components,
1734 no_earlyz,
1735 NULL);
1736 }
1737
1738 if (gs || hs) {
1739 tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
1740 }
1741 }
1742
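/* Translate the vertex input state into VFD_DECODE/VFD_DEST_CNTL entries,
 * skipping attributes that the VS doesn't actually read.
 */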
1743 static void
1744 tu6_emit_vertex_input(struct tu_pipeline *pipeline,
1745 struct tu_cs *cs,
1746 const struct ir3_shader_variant *vs,
1747 const VkPipelineVertexInputStateCreateInfo *info)
1748 {
1749 uint32_t vfd_decode_idx = 0;
1750 uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
1751 uint32_t step_rate[MAX_VBS];
1752
1753 for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
1754 const VkVertexInputBindingDescription *binding =
1755 &info->pVertexBindingDescriptions[i];
1756
1757 if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
1758 tu_cs_emit_regs(cs,
1759 A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
1760 }
1761
1762 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1763 binding_instanced |= 1 << binding->binding;
1764
1765 step_rate[binding->binding] = 1;
1766 }
1767
1768 const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
1769 vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1770 if (div_state) {
1771 for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
1772 const VkVertexInputBindingDivisorDescriptionEXT *desc =
1773 &div_state->pVertexBindingDivisors[i];
1774 step_rate[desc->binding] = desc->divisor;
1775 }
1776 }
1777
1778 /* TODO: emit all VFD_DECODE/VFD_DEST_CNTL in same (two) pkt4 */
1779
1780 for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
1781 const VkVertexInputAttributeDescription *attr =
1782 &info->pVertexAttributeDescriptions[i];
1783 uint32_t input_idx;
1784
1785 for (input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
1786 if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == attr->location)
1787 break;
1788 }
1789
1790 /* attribute not used, skip it */
1791 if (input_idx == vs->inputs_count)
1792 continue;
1793
1794 const struct tu_native_format format = tu6_format_vtx(attr->format);
1795 tu_cs_emit_regs(cs,
1796 A6XX_VFD_DECODE_INSTR(vfd_decode_idx,
1797 .idx = attr->binding,
1798 .offset = attr->offset,
1799 .instanced = binding_instanced & (1 << attr->binding),
1800 .format = format.fmt,
1801 .swap = format.swap,
1802 .unk30 = 1,
1803 ._float = !vk_format_is_int(attr->format)),
1804 A6XX_VFD_DECODE_STEP_RATE(vfd_decode_idx, step_rate[attr->binding]));
1805
1806 tu_cs_emit_regs(cs,
1807 A6XX_VFD_DEST_CNTL_INSTR(vfd_decode_idx,
1808 .writemask = vs->inputs[input_idx].compmask,
1809 .regid = vs->inputs[input_idx].regid));
1810
1811 vfd_decode_idx++;
1812 }
1813
1814 tu_cs_emit_regs(cs,
1815 A6XX_VFD_CONTROL_0(
1816 .fetch_cnt = vfd_decode_idx, /* decode_cnt for binning pass ? */
1817 .decode_cnt = vfd_decode_idx));
1818 }
1819
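/* Emit the viewport transforms, per-viewport scissors, the guardband and
 * the depth clamp range.
 */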
1820 void
1821 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport)
1822 {
1823 VkExtent2D guardband = {511, 511};
1824
1825 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
1826 for (uint32_t i = 0; i < num_viewport; i++) {
1827 const VkViewport *viewport = &viewports[i];
1828 float offsets[3];
1829 float scales[3];
1830 scales[0] = viewport->width / 2.0f;
1831 scales[1] = viewport->height / 2.0f;
1832 scales[2] = viewport->maxDepth - viewport->minDepth;
1833 offsets[0] = viewport->x + scales[0];
1834 offsets[1] = viewport->y + scales[1];
1835 offsets[2] = viewport->minDepth;
1836 for (uint32_t j = 0; j < 3; j++) {
1837 tu_cs_emit(cs, fui(offsets[j]));
1838 tu_cs_emit(cs, fui(scales[j]));
1839 }
1840
1841 guardband.width =
1842 MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
1843 guardband.height =
1844 MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
1845 }
1846
1847 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
1848 for (uint32_t i = 0; i < num_viewport; i++) {
1849 const VkViewport *viewport = &viewports[i];
1850 VkOffset2D min;
1851 VkOffset2D max;
1852 min.x = (int32_t) viewport->x;
1853 max.x = (int32_t) ceilf(viewport->x + viewport->width);
1854 if (viewport->height >= 0.0f) {
1855 min.y = (int32_t) viewport->y;
1856 max.y = (int32_t) ceilf(viewport->y + viewport->height);
1857 } else {
1858 min.y = (int32_t)(viewport->y + viewport->height);
1859 max.y = (int32_t) ceilf(viewport->y);
1860 }
1861 /* the spec allows viewport->height to be 0.0f */
1862 if (min.y == max.y)
1863 max.y++;
1864 /* allow viewport->width = 0.0f for un-initialized viewports: */
1865 if (min.x == max.x)
1866 max.x++;
1867
1868 min.x = MAX2(min.x, 0);
1869 min.y = MAX2(min.y, 0);
1870
1871 assert(min.x < max.x);
1872 assert(min.y < max.y);
1873 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
1874 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
1875 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(max.x - 1) |
1876 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(max.y - 1));
1877 }
1878
1879 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
1880 for (uint32_t i = 0; i < num_viewport; i++) {
1881 const VkViewport *viewport = &viewports[i];
1882 tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
1883 tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
1884 }
1885 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
1886 tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
1887 A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
1888
1889    /* TODO: what to do about this and multi-viewport? */
1890 float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
1891 float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
1892
1893 tu_cs_emit_regs(cs,
1894 A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
1895 A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
1896 }
1897
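/* Emit the screen scissors, clamping to the hardware maximum and producing
 * an empty scissor for zero-sized rectangles.
 */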
1898 void
1899 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
1900 {
1901 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
1902
1903 for (uint32_t i = 0; i < scissor_count; i++) {
1904 const VkRect2D *scissor = &scissors[i];
1905
1906 uint32_t min_x = scissor->offset.x;
1907 uint32_t min_y = scissor->offset.y;
1908 uint32_t max_x = min_x + scissor->extent.width - 1;
1909 uint32_t max_y = min_y + scissor->extent.height - 1;
1910
1911 if (!scissor->extent.width || !scissor->extent.height) {
1912 min_x = min_y = 1;
1913 max_x = max_y = 0;
1914 } else {
1915 /* avoid overflow */
1916 uint32_t scissor_max = BITFIELD_MASK(15);
1917 min_x = MIN2(scissor_max, min_x);
1918 min_y = MIN2(scissor_max, min_y);
1919 max_x = MIN2(scissor_max, max_x);
1920 max_y = MIN2(scissor_max, max_y);
1921 }
1922
1923 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
1924 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
1925 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
1926 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
1927 }
1928 }
1929
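/* Program custom sample locations, or reset the sample configs to the
 * default positions when samp_loc is NULL.
 */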
1930 void
1931 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
1932 {
1933 if (!samp_loc) {
1934 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
1935 tu_cs_emit(cs, 0);
1936
1937 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
1938 tu_cs_emit(cs, 0);
1939
1940 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
1941 tu_cs_emit(cs, 0);
1942 return;
1943 }
1944
1945 assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
1946 assert(samp_loc->sampleLocationGridSize.width == 1);
1947 assert(samp_loc->sampleLocationGridSize.height == 1);
1948
1949 uint32_t sample_config =
1950 A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
1951 uint32_t sample_locations = 0;
1952 for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
1953 sample_locations |=
1954 (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
1955 A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
1956 }
1957
1958 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
1959 tu_cs_emit(cs, sample_config);
1960 tu_cs_emit(cs, sample_locations);
1961
1962 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
1963 tu_cs_emit(cs, sample_config);
1964 tu_cs_emit(cs, sample_locations);
1965
1966 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
1967 tu_cs_emit(cs, sample_config);
1968 tu_cs_emit(cs, sample_locations);
1969 }
1970
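/* Build the GRAS_SU_CNTL value from the rasterization state: culling,
 * front face, line half-width/mode, depth bias enable and multiview.
 */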
1971 static uint32_t
1972 tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
1973 enum a5xx_line_mode line_mode,
1974 bool multiview)
1975 {
1976 uint32_t gras_su_cntl = 0;
1977
1978 if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT)
1979 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
1980 if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT)
1981 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
1982
1983 if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
1984 gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
1985
1986 gras_su_cntl |=
1987 A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
1988
1989 if (rast_info->depthBiasEnable)
1990 gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
1991
1992 gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode);
1993
1994 if (multiview) {
1995 gras_su_cntl |=
1996 A6XX_GRAS_SU_CNTL_UNK17 |
1997 A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
1998 }
1999
2000 return gras_su_cntl;
2001 }
2002
2003 void
2004 tu6_emit_depth_bias(struct tu_cs *cs,
2005 float constant_factor,
2006 float clamp,
2007 float slope_factor)
2008 {
2009 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2010 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
2011 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
2012 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
2013 }
2014
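/* Build the per-MRT blend control value, substituting the color blend
 * factors that read dst alpha when the attachment format has no alpha
 * channel.
 */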
2015 static uint32_t
2016 tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
2017 bool has_alpha)
2018 {
2019 const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
2020 const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
2021 has_alpha ? att->srcColorBlendFactor
2022 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
2023 const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
2024 has_alpha ? att->dstColorBlendFactor
2025 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
2026 const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
2027 const enum adreno_rb_blend_factor src_alpha_factor =
2028 tu6_blend_factor(att->srcAlphaBlendFactor);
2029 const enum adreno_rb_blend_factor dst_alpha_factor =
2030 tu6_blend_factor(att->dstAlphaBlendFactor);
2031
2032 return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
2033 A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
2034 A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
2035 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
2036 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
2037 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
2038 }
2039
2040 static uint32_t
2041 tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
2042 uint32_t rb_mrt_control_rop,
2043 bool has_alpha)
2044 {
2045 uint32_t rb_mrt_control =
2046 A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
2047
2048 rb_mrt_control |= rb_mrt_control_rop;
2049
2050 if (att->blendEnable) {
2051 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;
2052
2053 if (has_alpha)
2054 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
2055 }
2056
2057 return rb_mrt_control;
2058 }
2059
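/* Emit RB_MRT_CONTROL/RB_MRT_BLEND_CONTROL for every color attachment and
 * return a mask of attachments that blend or use a dst-reading logic op.
 */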
2060 static void
2061 tu6_emit_rb_mrt_controls(struct tu_cs *cs,
2062 const VkPipelineColorBlendStateCreateInfo *blend_info,
2063 const VkFormat attachment_formats[MAX_RTS],
2064 uint32_t *blend_enable_mask)
2065 {
2066 *blend_enable_mask = 0;
2067
2068 bool rop_reads_dst = false;
2069 uint32_t rb_mrt_control_rop = 0;
2070 if (blend_info->logicOpEnable) {
2071 rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp);
2072 rb_mrt_control_rop =
2073 A6XX_RB_MRT_CONTROL_ROP_ENABLE |
2074 A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
2075 }
2076
2077 for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
2078 const VkPipelineColorBlendAttachmentState *att =
2079 &blend_info->pAttachments[i];
2080 const VkFormat format = attachment_formats[i];
2081
2082 uint32_t rb_mrt_control = 0;
2083 uint32_t rb_mrt_blend_control = 0;
2084 if (format != VK_FORMAT_UNDEFINED) {
2085 const bool has_alpha = vk_format_has_alpha(format);
2086
2087 rb_mrt_control =
2088 tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
2089 rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
2090
2091 if (att->blendEnable || rop_reads_dst)
2092 *blend_enable_mask |= 1 << i;
2093 }
2094
2095 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
2096 tu_cs_emit(cs, rb_mrt_control);
2097 tu_cs_emit(cs, rb_mrt_blend_control);
2098 }
2099 }
2100
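/* Emit SP_BLEND_CNTL/RB_BLEND_CNTL: blend enables, dual-source blend, the
 * sample mask and alpha-to-coverage/alpha-to-one.
 */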
2101 static void
2102 tu6_emit_blend_control(struct tu_cs *cs,
2103 uint32_t blend_enable_mask,
2104 bool dual_src_blend,
2105 const VkPipelineMultisampleStateCreateInfo *msaa_info)
2106 {
2107 const uint32_t sample_mask =
2108 msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
2109 : ((1 << msaa_info->rasterizationSamples) - 1);
2110
2111 tu_cs_emit_regs(cs,
2112 A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
2113 .dual_color_in_enable = dual_src_blend,
2114 .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2115 .unk8 = true));
2116
2117 /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
2118 tu_cs_emit_regs(cs,
2119 A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
2120 .independent_blend = true,
2121 .sample_mask = sample_mask,
2122 .dual_color_in_enable = dual_src_blend,
2123 .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2124 .alpha_to_one = msaa_info->alphaToOneEnable));
2125 }
2126
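/* Compute the total private memory footprint across all SPs, optionally
 * filling in the per-fiber and per-SP sizes in the config.
 */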
2127 static uint32_t
2128 calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config,
2129 uint32_t pvtmem_bytes)
2130 {
2131 uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512);
2132 uint32_t per_sp_size =
2133 ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12);
2134
2135 if (config) {
2136 config->per_fiber_size = per_fiber_size;
2137 config->per_sp_size = per_sp_size;
2138 }
2139
2140 return dev->physical_device->info->num_sp_cores * per_sp_size;
2141 }
2142
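/* Allocate the BO backing shader private memory and fill in the pvtmem
 * config; no-op when pvtmem_bytes is zero.
 */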
2143 static VkResult
2144 tu_setup_pvtmem(struct tu_device *dev,
2145 struct tu_pipeline *pipeline,
2146 struct tu_pvtmem_config *config,
2147 uint32_t pvtmem_bytes, bool per_wave)
2148 {
2149 if (!pvtmem_bytes) {
2150 memset(config, 0, sizeof(*config));
2151 return VK_SUCCESS;
2152 }
2153
2154 uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes);
2155 config->per_wave = per_wave;
2156
2157 VkResult result =
2158 tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size,
2159 TU_BO_ALLOC_NO_FLAGS);
2160 if (result != VK_SUCCESS)
2161 return result;
2162
2163 config->iova = pipeline->pvtmem_bo.iova;
2164
2165 return result;
2166 }
2167
2168
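/* Size and initialize the pipeline command stream, accounting for the
 * load-state IB, shader binaries, private memory and any per-stage extra
 * reservations.
 */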
2169 static VkResult
2170 tu_pipeline_allocate_cs(struct tu_device *dev,
2171 struct tu_pipeline *pipeline,
2172 struct tu_pipeline_builder *builder,
2173 struct ir3_shader_variant *compute)
2174 {
2175 uint32_t size = 2048 + tu6_load_state_size(pipeline, compute);
2176
2177 /* graphics case: */
2178 if (builder) {
2179 uint32_t pvtmem_bytes = 0;
2180 for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
2181 if (builder->variants[i]) {
2182 size += builder->variants[i]->info.size / 4;
2183 pvtmem_bytes = MAX2(pvtmem_bytes, builder->variants[i]->pvtmem_size);
2184 }
2185 }
2186
2187 size += builder->binning_variant->info.size / 4;
2188 pvtmem_bytes = MAX2(pvtmem_bytes, builder->binning_variant->pvtmem_size);
2189
2190 size += calc_pvtmem_size(dev, NULL, pvtmem_bytes) / 4;
2191
2192 builder->additional_cs_reserve_size = 0;
2193 for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) {
2194 struct ir3_shader_variant *variant = builder->variants[i];
2195 if (variant) {
2196 builder->additional_cs_reserve_size +=
2197 tu_xs_get_additional_cs_size_dwords(variant);
2198
2199 if (variant->binning) {
2200 builder->additional_cs_reserve_size +=
2201 tu_xs_get_additional_cs_size_dwords(variant->binning);
2202 }
2203 }
2204 }
2205
2206 size += builder->additional_cs_reserve_size;
2207 } else {
2208 size += compute->info.size / 4;
2209 size += calc_pvtmem_size(dev, NULL, compute->pvtmem_size) / 4;
2210
2211 size += tu_xs_get_additional_cs_size_dwords(compute);
2212 }
2213
2214 tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
2215
2216 /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
2217 * that LOAD_STATE can potentially take up a large amount of space so we
2218 * calculate its size explicitly.
2219 */
2220 return tu_cs_reserve_space(&pipeline->cs, size);
2221 }
2222
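/* Initialize the ir3 shader key from the pipeline create info (GS presence,
 * MSAA, sample shading); the tessellation mode is filled in later from NIR.
 */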
2223 static void
2224 tu_pipeline_shader_key_init(struct ir3_shader_key *key,
2225 const struct tu_pipeline *pipeline,
2226 const VkGraphicsPipelineCreateInfo *pipeline_info)
2227 {
2228 for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
2229 if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
2230 key->has_gs = true;
2231 break;
2232 }
2233 }
2234
2235 if (pipeline_info->pRasterizationState->rasterizerDiscardEnable &&
2236 !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD)))
2237 return;
2238
2239 const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
2240 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
2241 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
2242 if (msaa_info->rasterizationSamples > 1 ||
2243 /* also set msaa key when sample location is not the default
2244 * since this affects varying interpolation */
2245 (sample_locations && sample_locations->sampleLocationsEnable)) {
2246 key->msaa = true;
2247 }
2248
2249 /* note: not actually used by ir3, just checked in tu6_emit_fs_inputs */
2250 if (msaa_info->sampleShadingEnable)
2251 key->sample_shading = true;
2252
2253 /* We set this after we compile to NIR because we need the prim mode */
2254 key->tessellation = IR3_TESS_NONE;
2255 }
2256
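/* Map the NIR tessellation primitive mode to the ir3 tessellation mode. */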
2257 static uint32_t
2258 tu6_get_tessmode(struct tu_shader* shader)
2259 {
2260 uint32_t primitive_mode = shader->ir3_shader->nir->info.tess.primitive_mode;
2261 switch (primitive_mode) {
2262 case GL_ISOLINES:
2263 return IR3_TESS_ISOLINES;
2264 case GL_TRIANGLES:
2265 return IR3_TESS_TRIANGLES;
2266 case GL_QUADS:
2267 return IR3_TESS_QUADS;
2268 case GL_NONE:
2269 return IR3_TESS_NONE;
2270 default:
2271 unreachable("bad tessmode");
2272 }
2273 }
2274
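/* Copy a shader variant's binary into the pipeline CS and return its iova
 * (0 if there is no variant).
 */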
2275 static uint64_t
2276 tu_upload_variant(struct tu_pipeline *pipeline,
2277 const struct ir3_shader_variant *variant)
2278 {
2279 struct tu_cs_memory memory;
2280
2281 if (!variant)
2282 return 0;
2283
2284 /* this expects to get enough alignment because shaders are allocated first
2285 * and total size is always aligned correctly
2286 * note: an assert in tu6_emit_xs_config validates the alignment
2287 */
2288 tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
2289
2290 memcpy(memory.map, variant->bin, variant->info.size);
2291 return memory.iova;
2292 }
2293
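/* Record the variant's NIR and disassembly for
 * VK_KHR_pipeline_executable_properties queries.
 */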
2294 static void
2295 tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant,
2296 char *nir_from_spirv)
2297 {
2298 ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.nir);
2299 ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.disasm);
2300
2301 struct tu_pipeline_executable exe = {
2302 .stage = variant->shader->type,
2303 .nir_from_spirv = nir_from_spirv,
2304 .nir_final = variant->disasm_info.nir,
2305 .disasm = variant->disasm_info.disasm,
2306 .stats = variant->info,
2307 .is_binning = variant->binning_pass,
2308 };
2309
2310 util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
2311 }
2312
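/* Translate SPIR-V to NIR and compile ir3 variants for all stages: create a
 * no-op FS if none is provided, recompile with safe constlen where needed,
 * and pick or compile the binning-pass VS variant.
 */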
2313 static VkResult
2314 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
2315 struct tu_pipeline *pipeline)
2316 {
2317 const struct ir3_compiler *compiler = builder->device->compiler;
2318 const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
2319 NULL
2320 };
2321 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2322 gl_shader_stage stage =
2323 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2324 stage_infos[stage] = &builder->create_info->pStages[i];
2325 }
2326
2327 struct ir3_shader_key key = {};
2328 tu_pipeline_shader_key_init(&key, pipeline, builder->create_info);
2329
2330 nir_shader *nir[ARRAY_SIZE(builder->shaders)] = { NULL };
2331
2332 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2333 stage < ARRAY_SIZE(nir); stage++) {
2334 const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2335 if (!stage_info)
2336 continue;
2337
2338 nir[stage] = tu_spirv_to_nir(builder->device, stage_info, stage);
2339 if (!nir[stage])
2340 return VK_ERROR_OUT_OF_HOST_MEMORY;
2341 }
2342
2343 if (!nir[MESA_SHADER_FRAGMENT]) {
2344 const nir_shader_compiler_options *nir_options =
2345 ir3_get_compiler_options(builder->device->compiler);
2346 nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
2347 nir_options,
2348 "noop_fs");
2349 nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
2350 }
2351
2352 const bool executable_info = builder->create_info->flags &
2353 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
2354
2355 char *nir_initial_disasm[ARRAY_SIZE(builder->shaders)] = { NULL };
2356
2357 if (executable_info) {
2358 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2359 stage < ARRAY_SIZE(nir); stage++) {
2360 if (!nir[stage])
2361 continue;
2362
2363 nir_initial_disasm[stage] =
2364 nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
2365 }
2366 }
2367
2368 /* TODO do intra-stage linking here */
2369
2370 uint32_t desc_sets = 0;
2371 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2372 stage < ARRAY_SIZE(nir); stage++) {
2373 if (!nir[stage])
2374 continue;
2375
2376 struct tu_shader *shader =
2377 tu_shader_create(builder->device, nir[stage],
2378 builder->multiview_mask, builder->layout,
2379 builder->alloc);
2380 if (!shader)
2381 return VK_ERROR_OUT_OF_HOST_MEMORY;
2382
2383 /* In SPIR-V generated from GLSL, the primitive mode is specified in the
2384 * tessellation evaluation shader, but in SPIR-V generated from HLSL,
2385 * the mode is specified in the tessellation control shader. */
2386 if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
2387 key.tessellation == IR3_TESS_NONE) {
2388 key.tessellation = tu6_get_tessmode(shader);
2389 }
2390
2391 if (stage > MESA_SHADER_TESS_CTRL) {
2392 if (stage == MESA_SHADER_FRAGMENT) {
2393 key.tcs_store_primid = key.tcs_store_primid ||
2394 (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2395 } else {
2396 key.tcs_store_primid = key.tcs_store_primid ||
2397 BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2398 }
2399 }
2400
2401 /* Keep track of the status of each shader's active descriptor sets,
2402 * which is set in tu_lower_io. */
2403 desc_sets |= shader->active_desc_sets;
2404
2405 builder->shaders[stage] = shader;
2406 }
2407 pipeline->active_desc_sets = desc_sets;
2408
2409 struct tu_shader *last_shader = builder->shaders[MESA_SHADER_GEOMETRY];
2410 if (!last_shader)
2411 last_shader = builder->shaders[MESA_SHADER_TESS_EVAL];
2412 if (!last_shader)
2413 last_shader = builder->shaders[MESA_SHADER_VERTEX];
2414
2415 uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;
2416
2417 key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
2418 key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);
2419
2420 pipeline->tess.patch_type = key.tessellation;
2421
2422 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2423 stage < ARRAY_SIZE(builder->shaders); stage++) {
2424 if (!builder->shaders[stage])
2425 continue;
2426
2427 bool created;
2428 builder->variants[stage] =
2429 ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
2430 &key, false, executable_info, &created);
2431 if (!builder->variants[stage])
2432 return VK_ERROR_OUT_OF_HOST_MEMORY;
2433 }
2434
2435 uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler);
2436
2437 key.safe_constlen = true;
2438
2439 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2440 stage < ARRAY_SIZE(builder->shaders); stage++) {
2441 if (!builder->shaders[stage])
2442 continue;
2443
2444 if (safe_constlens & (1 << stage)) {
2445 bool created;
2446 builder->variants[stage] =
2447 ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
2448 &key, false, executable_info, &created);
2449 if (!builder->variants[stage])
2450 return VK_ERROR_OUT_OF_HOST_MEMORY;
2451 }
2452 }
2453
2454 const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
2455 struct ir3_shader_variant *variant;
2456
2457 if (vs->ir3_shader->stream_output.num_outputs ||
2458 !ir3_has_binning_vs(&key)) {
2459 variant = builder->variants[MESA_SHADER_VERTEX];
2460 } else {
2461 bool created;
2462 key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX));
2463 variant = ir3_shader_get_variant(vs->ir3_shader, &key,
2464 true, executable_info, &created);
2465 if (!variant)
2466 return VK_ERROR_OUT_OF_HOST_MEMORY;
2467 }
2468
2469 builder->binning_variant = variant;
2470
2471 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2472 stage < ARRAY_SIZE(nir); stage++) {
2473 if (builder->variants[stage]) {
2474 tu_append_executable(pipeline, builder->variants[stage],
2475 nir_initial_disasm[stage]);
2476 }
2477 }
2478
2479 if (builder->binning_variant != builder->variants[MESA_SHADER_VERTEX]) {
2480 tu_append_executable(pipeline, builder->binning_variant, NULL);
2481 }
2482
2483 return VK_SUCCESS;
2484 }
2485
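/* Record which states are dynamic and clear the corresponding bits from the
 * static register masks.
 */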
2486 static void
2487 tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
2488 struct tu_pipeline *pipeline)
2489 {
2490 const VkPipelineDynamicStateCreateInfo *dynamic_info =
2491 builder->create_info->pDynamicState;
2492
2493 pipeline->gras_su_cntl_mask = ~0u;
2494 pipeline->rb_depth_cntl_mask = ~0u;
2495 pipeline->rb_stencil_cntl_mask = ~0u;
2496 pipeline->pc_raster_cntl_mask = ~0u;
2497 pipeline->vpc_unknown_9107_mask = ~0u;
2498
2499 if (!dynamic_info)
2500 return;
2501
2502 for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
2503 VkDynamicState state = dynamic_info->pDynamicStates[i];
2504 switch (state) {
2505 case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
2506 if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
2507 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2508 pipeline->dynamic_state_mask |= BIT(state);
2509 break;
2510 case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
2511 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
2512 break;
2513 case VK_DYNAMIC_STATE_CULL_MODE_EXT:
2514 pipeline->gras_su_cntl_mask &=
2515 ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
2516 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2517 break;
2518 case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
2519 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
2520 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2521 break;
2522 case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
2523 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
2524 break;
2525 case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
2526 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
2527 break;
2528 case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
2529 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
2530 break;
2531 case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
2532 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
2533 break;
2534 case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
2535 pipeline->rb_depth_cntl_mask &=
2536 ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
2537 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2538 break;
2539 case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
2540 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2541 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2542 break;
2543 case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
2544 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
2545 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2546 break;
2547 case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
2548 pipeline->rb_depth_cntl_mask &=
2549 ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
2550 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2551 break;
2552 case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
2553 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2554 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2555 A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
2556 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
2557 break;
2558 case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
2559 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
2560 A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
2561 A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
2562 A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
2563 A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
2564 A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
2565 A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
2566 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
2567 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
2568 break;
2569 case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
2570 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
2571 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2572 break;
2573 case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
2574 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
2575 break;
2576 case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
2577 pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
2578 pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
2579 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
2580 break;
2581 default:
2582 assert(!"unsupported dynamic state");
2583 break;
2584 }
2585 }
2586 }
2587
2588 static void
2589 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2590 struct tu_shader *shader,
2591 struct ir3_shader_variant *v)
2592 {
2593 link->const_state = *ir3_const_state(v);
2594 link->constlen = v->constlen;
2595 link->push_consts = shader->push_consts;
2596 }
2597
2598 static void
2599 tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
2600 struct tu_pipeline *pipeline)
2601 {
2602 struct tu_cs prog_cs;
2603
2604 /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2605 * else that could depend on that state (like push constants)
2606 *
2607 * Note also that this always uses the full VS even in binning pass. The
2608 * binning pass variant has the same const layout as the full VS, and
2609 * the constlen for the VS will be the same or greater than the constlen
2610 * for the binning pass variant. It is required that the constlen state
2611 * matches between binning and draw passes, as some parts of the push
2612 * consts are emitted in state groups that are shared between the binning
2613 * and draw passes.
2614 */
2615 tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
2616 tu6_emit_program_config(&prog_cs, builder);
2617 pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2618
2619 tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
2620 tu6_emit_program(&prog_cs, builder, false, pipeline);
2621 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2622
2623 tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
2624 tu6_emit_program(&prog_cs, builder, true, pipeline);
2625 pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2626
2627 VkShaderStageFlags stages = 0;
2628 for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
2629 stages |= builder->create_info->pStages[i].stage;
2630 }
2631 pipeline->active_stages = stages;
2632
2633 for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
2634 if (!builder->shaders[i])
2635 continue;
2636
2637 tu_pipeline_set_linkage(&pipeline->program.link[i],
2638 builder->shaders[i],
2639 builder->variants[i]);
2640 }
2641 }
2642
2643 static void
2644 tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
2645 struct tu_pipeline *pipeline)
2646 {
2647 const VkPipelineVertexInputStateCreateInfo *vi_info =
2648 builder->create_info->pVertexInputState;
2649 const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
2650 const struct ir3_shader_variant *bs = builder->binning_variant;
2651
2652 /* Bindings may contain holes */
2653 for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
2654 pipeline->num_vbs =
2655 MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
2656 }
2657
2658 struct tu_cs vi_cs;
2659 tu_cs_begin_sub_stream(&pipeline->cs,
2660 MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2661 tu6_emit_vertex_input(pipeline, &vi_cs, vs, vi_info);
2662 pipeline->vi.state = tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2663
2664 if (bs) {
2665 tu_cs_begin_sub_stream(&pipeline->cs,
2666 MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2667 tu6_emit_vertex_input(pipeline, &vi_cs, bs, vi_info);
2668 pipeline->vi.binning_state =
2669 tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2670 }
2671 }
2672
2673 static void
2674 tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
2675 struct tu_pipeline *pipeline)
2676 {
2677 const VkPipelineInputAssemblyStateCreateInfo *ia_info =
2678 builder->create_info->pInputAssemblyState;
2679
2680 pipeline->ia.primtype = tu6_primtype(ia_info->topology);
2681 pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
2682 }
2683
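/* Start building a static draw state of 'size' dwords for the given state
 * id, unless that state is dynamic.
 */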
2684 static bool
2685 tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
2686 uint32_t id, uint32_t size)
2687 {
2688 assert(id < ARRAY_SIZE(pipeline->dynamic_state));
2689
2690 if (pipeline->dynamic_state_mask & BIT(id))
2691 return false;
2692
2693 pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
2694 return true;
2695 }
2696
2697 static void
2698 tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
2699 struct tu_pipeline *pipeline)
2700 {
2701 if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
2702 !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
2703 return;
2704
2705 const VkPipelineTessellationStateCreateInfo *tess_info =
2706 builder->create_info->pTessellationState;
2707
2708 assert(pipeline->ia.primtype == DI_PT_PATCHES0);
2709 assert(tess_info->patchControlPoints <= 32);
2710 pipeline->ia.primtype += tess_info->patchControlPoints;
2711 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
2712 vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
2713 pipeline->tess.upper_left_domain_origin = !domain_info ||
2714 domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
2715 const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
2716 const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
2717 pipeline->tess.param_stride = hs->output_size * 4;
2718 pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
2719 pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
2720 }
2721
2722 static void
2723 tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
2724 struct tu_pipeline *pipeline)
2725 {
2726 /* The spec says:
2727 *
2728    *    "pViewportState is a pointer to an instance of the
2729    *    VkPipelineViewportStateCreateInfo structure, and is ignored if the
2730    *    pipeline has rasterization disabled."
2731 *
2732 * We leave the relevant registers stale in that case.
2733 */
2734 if (builder->rasterizer_discard)
2735 return;
2736
2737 const VkPipelineViewportStateCreateInfo *vp_info =
2738 builder->create_info->pViewportState;
2739
2740 struct tu_cs cs;
2741
2742 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
2743 tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount);
2744
2745 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
2746 tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
2747 }
2748
2749 static void
2750 tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
2751 struct tu_pipeline *pipeline)
2752 {
2753 const VkPipelineRasterizationStateCreateInfo *rast_info =
2754 builder->create_info->pRasterizationState;
2755
2756 enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);
2757
2758 bool depth_clip_disable = rast_info->depthClampEnable;
2759
2760 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
2761 vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
2762 if (depth_clip_state)
2763 depth_clip_disable = !depth_clip_state->depthClipEnable;
2764
2765 pipeline->line_mode = RECTANGULAR;
2766
2767 if (tu6_primtype_line(pipeline->ia.primtype)) {
2768 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
2769 vk_find_struct_const(rast_info->pNext,
2770 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
2771
2772 if (rast_line_state && rast_line_state->lineRasterizationMode ==
2773 VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
2774 pipeline->line_mode = BRESENHAM;
2775 }
2776 }
2777
2778 struct tu_cs cs;
2779 uint32_t cs_size = 9 +
2780 (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) +
2781 (builder->emit_msaa_state ? 11 : 0);
2782 pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);
2783
2784 tu_cs_emit_regs(&cs,
2785 A6XX_GRAS_CL_CNTL(
2786 .znear_clip_disable = depth_clip_disable,
2787 .zfar_clip_disable = depth_clip_disable,
2788 /* TODO should this be depth_clip_disable instead? */
2789 .unk5 = rast_info->depthClampEnable,
2790 .zero_gb_scale_z = 1,
2791 .vp_clip_code_ignore = 1));
2792
2793 tu_cs_emit_regs(&cs,
2794 A6XX_VPC_POLYGON_MODE(mode));
2795
2796 tu_cs_emit_regs(&cs,
2797 A6XX_PC_POLYGON_MODE(mode));
2798
2799 /* move to hw ctx init? */
2800 tu_cs_emit_regs(&cs,
2801 A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
2802 A6XX_GRAS_SU_POINT_SIZE(1.0f));
2803
2804 if (builder->device->physical_device->info->a6xx.has_shading_rate) {
2805 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
2806 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
2807 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
2808 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
2809 }
2810
2811    /* If the sample count couldn't be determined from the subpass, we have to emit it here.
2812     * This happens when the subpass doesn't use any color/depth attachment.
2813     */
2814 if (builder->emit_msaa_state)
2815 tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);
2816
2817 const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
2818 vk_find_struct_const(rast_info->pNext,
2819 PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
2820 unsigned stream = stream_info ? stream_info->rasterizationStream : 0;
2821
2822 pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
2823 pipeline->vpc_unknown_9107 = 0;
2824 if (rast_info->rasterizerDiscardEnable) {
2825 pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
2826 pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
2827 }
2828
2829 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
2830 tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
2831 tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
2832 }
2833
2834 pipeline->gras_su_cntl =
2835 tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);
2836
2837 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
2838 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));
2839
2840 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
2841 tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
2842 rast_info->depthBiasClamp,
2843 rast_info->depthBiasSlopeFactor);
2844 }
2845
2846 const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
2847 vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
2848 pipeline->provoking_vertex_last = provoking_vtx_state &&
2849 provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
2850 }
2851
2852 static void
2853 tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
2854 struct tu_pipeline *pipeline)
2855 {
2856 /* The spec says:
2857 *
2858 * pDepthStencilState is a pointer to an instance of the
2859 * VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
2860 * the pipeline has rasterization disabled or if the subpass of the
2861 * render pass the pipeline is created against does not use a
2862 * depth/stencil attachment.
2863 */
2864 const VkPipelineDepthStencilStateCreateInfo *ds_info =
2865 builder->create_info->pDepthStencilState;
2866 const VkPipelineRasterizationStateCreateInfo *rast_info =
2867 builder->create_info->pRasterizationState;
2868 uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
2869 struct tu_cs cs;
2870
2871 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
2872 builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
2873 if (ds_info->depthTestEnable) {
2874 rb_depth_cntl |=
2875 A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
2876 A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
2877 A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */
2878
2879 if (rast_info->depthClampEnable)
2880 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE;
2881
2882 if (ds_info->depthWriteEnable)
2883 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2884 }
2885
2886 if (ds_info->depthBoundsTestEnable)
2887 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
2888
2889 if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
2890 tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
2891 } else {
2892 /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
2893 * to 0 when this pipeline is used, as enabling depth test when there
2894 * is no depth attachment is a problem (at least for the S8_UINT case)
2895 */
2896 if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
2897 pipeline->rb_depth_cntl_disable = true;
2898 }
2899
2900 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
2901 const VkStencilOpState *front = &ds_info->front;
2902 const VkStencilOpState *back = &ds_info->back;
2903
2904 rb_stencil_cntl |=
2905 A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
2906 A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
2907 A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
2908 A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
2909 A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
2910 A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
2911 A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
2912 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
2913
2914 if (ds_info->stencilTestEnable) {
2915 rb_stencil_cntl |=
2916 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2917 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2918 A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
2919 }
2920 }
2921
2922 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
2923 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
2924 tu_cs_emit(&cs, rb_depth_cntl);
2925 }
2926 pipeline->rb_depth_cntl = rb_depth_cntl;
2927
2928 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
2929 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
2930 tu_cs_emit(&cs, rb_stencil_cntl);
2931 }
2932 pipeline->rb_stencil_cntl = rb_stencil_cntl;
2933
2934    /* The remaining draw states aren't used if there is no depth/stencil attachment, so leave them empty. */
2935 if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
2936 return;
2937
2938 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
2939 tu_cs_emit_regs(&cs,
2940 A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
2941 A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
2942 }
2943
2944 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
2945 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
2946 .bfmask = ds_info->back.compareMask & 0xff));
2947 }
2948
2949 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
2950 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
2951 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
2952 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask));
2953 }
2954
2955 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
2956 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
2957 .bfref = ds_info->back.reference & 0xff));
2958 }
2959
2960 if (builder->shaders[MESA_SHADER_FRAGMENT]) {
2961 const struct ir3_shader_variant *fs = &builder->shaders[MESA_SHADER_FRAGMENT]->ir3_shader->variants[0];
2962 if (fs->has_kill || fs->no_earlyz || fs->writes_pos) {
2963 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
2964 }
2965 if (fs->no_earlyz || fs->writes_pos) {
2966 pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
2967 }
2968 }
2969 }
2970
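/* Build the blend draw state (per-MRT controls plus the global blend
 * control) and the static blend-constants and sample-locations states.
 * Also force-disables LRZ writes when any attachment blends or masks
 * color channels, since such draws depend on previously written fragments.
 */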
2971 static void
2972 tu_pipeline_builder_parse_multisample_and_color_blend(
2973 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
2974 {
2975 /* The spec says:
2976 *
2977 * pMultisampleState is a pointer to an instance of the
2978 * VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
2979 * has rasterization disabled.
2980 *
2981 * Also,
2982 *
2983 * pColorBlendState is a pointer to an instance of the
2984 * VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
2985 * pipeline has rasterization disabled or if the subpass of the render
2986 * pass the pipeline is created against does not use any color
2987 * attachments.
2988 *
2989 * We leave the relevant registers stale when rasterization is disabled.
2990 */
2991 if (builder->rasterizer_discard)
2992 return;
2993
2994 static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
2995 const VkPipelineMultisampleStateCreateInfo *msaa_info =
2996 builder->create_info->pMultisampleState;
2997 const VkPipelineColorBlendStateCreateInfo *blend_info =
2998 builder->use_color_attachments ? builder->create_info->pColorBlendState
2999 : &dummy_blend_info;
3000
3001 struct tu_cs cs;
3002 pipeline->blend_state =
3003 tu_cs_draw_state(&pipeline->cs, &cs, blend_info->attachmentCount * 3 + 4);
3004
3005 uint32_t blend_enable_mask;
3006 tu6_emit_rb_mrt_controls(&cs, blend_info,
3007 builder->color_attachment_formats,
3008 &blend_enable_mask);
3009
3010 tu6_emit_blend_control(&cs, blend_enable_mask,
3011 builder->use_dual_src_blend, msaa_info);
3012
3013 assert(cs.cur == cs.end); /* validate draw state size */
3014
3015 if (blend_enable_mask) {
3016 for (int i = 0; i < blend_info->attachmentCount; i++) {
3017 VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
3018 /* Disable LRZ writes when blend is enabled, since the
3019 * resulting pixel value from the blend-draw
3020 * depends on an earlier draw, which LRZ in the draw pass
3021 * could early-reject if the previous blend-enabled draw wrote LRZ.
3022 *
3023 * From the PoV of LRZ, having masked color channels is
3024 * the same as having blend enabled, in that the draw will
3025 * care about the fragments from an earlier draw.
3026 *
3027 * TODO: We need to disable LRZ writes only for the binning pass.
3028 * Therefore, we need to emit it in a separate draw state. We keep
3029 * it disabled for sysmem path as well for the moment.
3030 */
3031 if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
3032 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3033 }
3034 }
3035 }
3036
3037 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
3038 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3039 tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
3040 }
3041
3042 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
3043 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
3044 const VkSampleLocationsInfoEXT *samp_loc = NULL;
3045
3046 if (sample_locations && sample_locations->sampleLocationsEnable)
3047 samp_loc = &sample_locations->sampleLocationsInfo;
3048
3049 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3050 samp_loc ? 9 : 6)) {
3051 tu6_emit_sample_locations(&cs, samp_loc);
3052 }
3053 }
3054
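/* Free everything the pipeline owns: its command-stream BOs, the private
 * memory BO (if one was allocated) and the ralloc context that holds the
 * executable/debug information.
 */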
3055 static void
3056 tu_pipeline_finish(struct tu_pipeline *pipeline,
3057 struct tu_device *dev,
3058 const VkAllocationCallbacks *alloc)
3059 {
3060 tu_cs_finish(&pipeline->cs);
3061
3062 if (pipeline->pvtmem_bo.size)
3063 tu_bo_finish(dev, &pipeline->pvtmem_bo);
3064
3065 ralloc_free(pipeline->executables_mem_ctx);
3066 }
3067
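/* Graphics pipeline build sequence: allocate the pipeline object, compile
 * and upload the shader variants, set up private memory, then let each
 * parse_* helper emit its part of the static draw states before finally
 * emitting the descriptor preload IB.
 */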
3068 static VkResult
3069 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3070 struct tu_pipeline **pipeline)
3071 {
3072 VkResult result;
3073
3074 *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
3075 sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
3076 if (!*pipeline)
3077 return VK_ERROR_OUT_OF_HOST_MEMORY;
3078
3079 (*pipeline)->layout = builder->layout;
3080 (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3081 util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3082
3083 /* compile and upload shaders */
3084 result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3085 if (result != VK_SUCCESS) {
3086 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3087 return result;
3088 }
3089
3090 result = tu_pipeline_allocate_cs(builder->device, *pipeline, builder, NULL);
3091 if (result != VK_SUCCESS) {
3092 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3093 return result;
3094 }
3095
3096 for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++)
3097 builder->shader_iova[i] = tu_upload_variant(*pipeline, builder->variants[i]);
3098
3099 builder->binning_vs_iova =
3100 tu_upload_variant(*pipeline, builder->binning_variant);
3101
3102    /* Set up private memory. Note that because we're sharing the same private
3103 * memory for all stages, all stages must use the same config, or else
3104 * fibers from one stage might overwrite fibers in another.
3105 */
3106
3107 uint32_t pvtmem_size = 0;
3108 bool per_wave = true;
3109 for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
3110 if (builder->variants[i]) {
3111 pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size);
3112 if (!builder->variants[i]->pvtmem_per_wave)
3113 per_wave = false;
3114 }
3115 }
3116
3117 if (builder->binning_variant) {
3118 pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
3119 if (!builder->binning_variant->pvtmem_per_wave)
3120 per_wave = false;
3121 }
3122
3123 result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
3124 pvtmem_size, per_wave);
3125 if (result != VK_SUCCESS) {
3126 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3127 return result;
3128 }
3129
3130 tu_pipeline_builder_parse_dynamic(builder, *pipeline);
3131 tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
3132 tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
3133 tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
3134 tu_pipeline_builder_parse_tessellation(builder, *pipeline);
3135 tu_pipeline_builder_parse_viewport(builder, *pipeline);
3136 tu_pipeline_builder_parse_rasterization(builder, *pipeline);
3137 tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3138 tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3139 tu6_emit_load_state(*pipeline, false);
3140
3141 /* we should have reserved enough space upfront such that the CS never
3142 * grows
3143 */
3144 assert((*pipeline)->cs.bo_count == 1);
3145
3146 return VK_SUCCESS;
3147 }
3148
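/* The builder only owns the tu_shader objects for the duration of the
 * build; the variant binaries have already been uploaded to the pipeline's
 * CS, so the shaders can be destroyed here.
 */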
3149 static void
3150 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3151 {
3152 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
3153 if (!builder->shaders[i])
3154 continue;
3155 tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc);
3156 }
3157 }
3158
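/* Gather the per-pipeline information the parse_* helpers need: whether
 * rasterizer discard is static or dynamic, the subpass' multiview mask and
 * sample count, and the depth/stencil and color attachment formats from
 * the render pass.
 */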
3159 static void
3160 tu_pipeline_builder_init_graphics(
3161 struct tu_pipeline_builder *builder,
3162 struct tu_device *dev,
3163 struct tu_pipeline_cache *cache,
3164 const VkGraphicsPipelineCreateInfo *create_info,
3165 const VkAllocationCallbacks *alloc)
3166 {
3167 TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);
3168
3169 *builder = (struct tu_pipeline_builder) {
3170 .device = dev,
3171 .cache = cache,
3172 .create_info = create_info,
3173 .alloc = alloc,
3174 .layout = layout,
3175 };
3176
3177 bool rasterizer_discard_dynamic = false;
3178 if (create_info->pDynamicState) {
3179 for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
3180 if (create_info->pDynamicState->pDynamicStates[i] ==
3181 VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) {
3182 rasterizer_discard_dynamic = true;
3183 break;
3184 }
3185 }
3186 }
3187
3188 const struct tu_render_pass *pass =
3189 tu_render_pass_from_handle(create_info->renderPass);
3190 const struct tu_subpass *subpass =
3191 &pass->subpasses[create_info->subpass];
3192
3193 builder->multiview_mask = subpass->multiview_mask;
3194
3195 builder->rasterizer_discard =
3196 builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
3197 !rasterizer_discard_dynamic;
3198
3199    /* variableMultisampleRate support: in a zero-attachment subpass (samples == 0) the sample count comes from the pipeline, so emit MSAA state per-pipeline */
3200 builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;
3201
3202 if (builder->rasterizer_discard) {
3203 builder->samples = VK_SAMPLE_COUNT_1_BIT;
3204 } else {
3205 builder->samples = create_info->pMultisampleState->rasterizationSamples;
3206 builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable;
3207
3208 const uint32_t a = subpass->depth_stencil_attachment.attachment;
3209 builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
3210 pass->attachments[a].format : VK_FORMAT_UNDEFINED;
3211
3212 assert(subpass->color_count == 0 ||
3213 !create_info->pColorBlendState ||
3214 subpass->color_count == create_info->pColorBlendState->attachmentCount);
3215 builder->color_attachment_count = subpass->color_count;
3216 for (uint32_t i = 0; i < subpass->color_count; i++) {
3217 const uint32_t a = subpass->color_attachments[i].attachment;
3218 if (a == VK_ATTACHMENT_UNUSED)
3219 continue;
3220
3221 builder->color_attachment_formats[i] = pass->attachments[a].format;
3222 builder->use_color_attachments = true;
3223 builder->render_components |= 0xf << (i * 4);
3224 }
3225
3226 if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
3227 builder->color_attachment_count++;
3228 builder->use_dual_src_blend = true;
3229 /* dual source blending has an extra fs output in the 2nd slot */
3230 if (subpass->color_attachments[0].attachment != VK_ATTACHMENT_UNUSED)
3231 builder->render_components |= 0xf << 4;
3232 }
3233 }
3234 }
3235
3236 static VkResult
3237 tu_graphics_pipeline_create(VkDevice device,
3238 VkPipelineCache pipelineCache,
3239 const VkGraphicsPipelineCreateInfo *pCreateInfo,
3240 const VkAllocationCallbacks *pAllocator,
3241 VkPipeline *pPipeline)
3242 {
3243 TU_FROM_HANDLE(tu_device, dev, device);
3244 TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
3245
3246 struct tu_pipeline_builder builder;
3247 tu_pipeline_builder_init_graphics(&builder, dev, cache,
3248 pCreateInfo, pAllocator);
3249
3250 struct tu_pipeline *pipeline = NULL;
3251 VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
3252 tu_pipeline_builder_finish(&builder);
3253
3254 if (result == VK_SUCCESS)
3255 *pPipeline = tu_pipeline_to_handle(pipeline);
3256 else
3257 *pPipeline = VK_NULL_HANDLE;
3258
3259 return result;
3260 }
3261
3262 VKAPI_ATTR VkResult VKAPI_CALL
3263 tu_CreateGraphicsPipelines(VkDevice device,
3264 VkPipelineCache pipelineCache,
3265 uint32_t count,
3266 const VkGraphicsPipelineCreateInfo *pCreateInfos,
3267 const VkAllocationCallbacks *pAllocator,
3268 VkPipeline *pPipelines)
3269 {
3270 VkResult final_result = VK_SUCCESS;
3271
3272 for (uint32_t i = 0; i < count; i++) {
3273 VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
3274 &pCreateInfos[i], pAllocator,
3275 &pPipelines[i]);
3276
3277 if (result != VK_SUCCESS)
3278 final_result = result;
3279 }
3280
3281 return final_result;
3282 }
3283
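/* Compute pipelines don't go through the graphics builder: compile the
 * single compute shader, upload its variant, set up private memory and
 * emit the compute program state and descriptor preload IB directly.
 */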
3284 static VkResult
3285 tu_compute_pipeline_create(VkDevice device,
3286 VkPipelineCache _cache,
3287 const VkComputePipelineCreateInfo *pCreateInfo,
3288 const VkAllocationCallbacks *pAllocator,
3289 VkPipeline *pPipeline)
3290 {
3291 TU_FROM_HANDLE(tu_device, dev, device);
3292 TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
3293 const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
3294 VkResult result;
3295
3296 struct tu_pipeline *pipeline;
3297
3298 *pPipeline = VK_NULL_HANDLE;
3299
3300 pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
3301 VK_OBJECT_TYPE_PIPELINE);
3302 if (!pipeline)
3303 return VK_ERROR_OUT_OF_HOST_MEMORY;
3304
3305 pipeline->layout = layout;
3306
3307 pipeline->executables_mem_ctx = ralloc_context(NULL);
3308 util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
3309
3310 struct ir3_shader_key key = {};
3311
3312 nir_shader *nir = tu_spirv_to_nir(dev, stage_info, MESA_SHADER_COMPUTE);
3313
3314 const bool executable_info = pCreateInfo->flags &
3315 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
3316
3317 char *nir_initial_disasm = executable_info ?
3318 nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;
3319
3320 struct tu_shader *shader =
3321 tu_shader_create(dev, nir, 0, layout, pAllocator);
3322 if (!shader) {
3323 result = VK_ERROR_OUT_OF_HOST_MEMORY;
3324 goto fail;
3325 }
3326
3327 pipeline->active_desc_sets = shader->active_desc_sets;
3328
3329 bool created;
3330 struct ir3_shader_variant *v =
3331 ir3_shader_get_variant(shader->ir3_shader, &key, false, executable_info, &created);
3332 if (!v) {
3333 result = VK_ERROR_OUT_OF_HOST_MEMORY;
3334 goto fail;
3335 }
3336
3337 tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
3338 shader, v);
3339
3340 result = tu_pipeline_allocate_cs(dev, pipeline, NULL, v);
3341 if (result != VK_SUCCESS)
3342 goto fail;
3343
3344 uint64_t shader_iova = tu_upload_variant(pipeline, v);
3345
3346 struct tu_pvtmem_config pvtmem;
3347 tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);
3348
3349 for (int i = 0; i < 3; i++)
3350 pipeline->compute.local_size[i] = v->local_size[i];
3351
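   /* Subgroup size follows the wave width chosen by the compiler: the base
    * size of 64, or 128 when the variant runs with double threadsize.
    */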
3352 pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
3353
3354 struct tu_cs prog_cs;
3355 uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
3356 tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
3357 tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
3358 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3359
3360 tu6_emit_load_state(pipeline, true);
3361
3362 tu_append_executable(pipeline, v, nir_initial_disasm);
3363
3364 tu_shader_destroy(dev, shader, pAllocator);
3365
3366 *pPipeline = tu_pipeline_to_handle(pipeline);
3367
3368 return VK_SUCCESS;
3369
3370 fail:
3371 if (shader)
3372 tu_shader_destroy(dev, shader, pAllocator);
3373
3374 vk_object_free(&dev->vk, pAllocator, pipeline);
3375
3376 return result;
3377 }
3378
3379 VKAPI_ATTR VkResult VKAPI_CALL
3380 tu_CreateComputePipelines(VkDevice device,
3381 VkPipelineCache pipelineCache,
3382 uint32_t count,
3383 const VkComputePipelineCreateInfo *pCreateInfos,
3384 const VkAllocationCallbacks *pAllocator,
3385 VkPipeline *pPipelines)
3386 {
3387 VkResult final_result = VK_SUCCESS;
3388
3389 for (uint32_t i = 0; i < count; i++) {
3390 VkResult result = tu_compute_pipeline_create(device, pipelineCache,
3391 &pCreateInfos[i],
3392 pAllocator, &pPipelines[i]);
3393 if (result != VK_SUCCESS)
3394 final_result = result;
3395 }
3396
3397 return final_result;
3398 }
3399
3400 VKAPI_ATTR void VKAPI_CALL
3401 tu_DestroyPipeline(VkDevice _device,
3402 VkPipeline _pipeline,
3403 const VkAllocationCallbacks *pAllocator)
3404 {
3405 TU_FROM_HANDLE(tu_device, dev, _device);
3406 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
3407
3408 if (!_pipeline)
3409 return;
3410
3411 tu_pipeline_finish(pipeline, dev, pAllocator);
3412 vk_object_free(&dev->vk, pAllocator, pipeline);
3413 }
3414
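/* Fill one of the fixed-size name/description strings used by the
 * VK_KHR_pipeline_executable_properties structs, asserting that the
 * formatted string fits.
 */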
3415 #define WRITE_STR(field, ...) ({ \
3416 memset(field, 0, sizeof(field)); \
3417 UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
3418 assert(_i > 0 && _i < sizeof(field)); \
3419 })
3420
3421 static const struct tu_pipeline_executable *
3422 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
3423 {
3424 assert(index < util_dynarray_num_elements(&pipeline->executables,
3425 struct tu_pipeline_executable));
3426 return util_dynarray_element(
3427 &pipeline->executables, struct tu_pipeline_executable, index);
3428 }
3429
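/* Enumerate the captured executables (one per shader variant, including the
 * binning VS) along with their stage, name and subgroup size.
 */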
3430 VKAPI_ATTR VkResult VKAPI_CALL
3431 tu_GetPipelineExecutablePropertiesKHR(
3432 VkDevice _device,
3433 const VkPipelineInfoKHR* pPipelineInfo,
3434 uint32_t* pExecutableCount,
3435 VkPipelineExecutablePropertiesKHR* pProperties)
3436 {
3437 TU_FROM_HANDLE(tu_device, dev, _device);
3438 TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
3439 VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount);
3440
3441 util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
3442 vk_outarray_append(&out, props) {
3443 gl_shader_stage stage = exe->stage;
3444 props->stages = mesa_to_vk_shader_stage(stage);
3445
3446 if (!exe->is_binning)
3447 WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
3448 else
3449 WRITE_STR(props->name, "Binning VS");
3450
3451 WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
3452
3453 props->subgroupSize =
3454 dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
3455 }
3456 }
3457
3458 return vk_outarray_status(&out);
3459 }
3460
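/* Report the ir3 shader statistics (instruction counts, register usage,
 * sync bits, etc.) for one executable.
 */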
3461 VKAPI_ATTR VkResult VKAPI_CALL
3462 tu_GetPipelineExecutableStatisticsKHR(
3463 VkDevice _device,
3464 const VkPipelineExecutableInfoKHR* pExecutableInfo,
3465 uint32_t* pStatisticCount,
3466 VkPipelineExecutableStatisticKHR* pStatistics)
3467 {
3468 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
3469 VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount);
3470
3471 const struct tu_pipeline_executable *exe =
3472 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3473
3474 vk_outarray_append(&out, stat) {
3475 WRITE_STR(stat->name, "Max Waves Per Core");
3476 WRITE_STR(stat->description,
3477 "Maximum number of simultaneous waves per core.");
3478 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3479 stat->value.u64 = exe->stats.max_waves;
3480 }
3481
3482 vk_outarray_append(&out, stat) {
3483 WRITE_STR(stat->name, "Instruction Count");
3484 WRITE_STR(stat->description,
3485 "Total number of IR3 instructions in the final generated "
3486 "shader executable.");
3487 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3488 stat->value.u64 = exe->stats.instrs_count;
3489 }
3490
3491 vk_outarray_append(&out, stat) {
3492 WRITE_STR(stat->name, "NOPs Count");
3493 WRITE_STR(stat->description,
3494 "Number of NOP instructions in the final generated "
3495 "shader executable.");
3496 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3497 stat->value.u64 = exe->stats.nops_count;
3498 }
3499
3500 vk_outarray_append(&out, stat) {
3501 WRITE_STR(stat->name, "MOV Count");
3502 WRITE_STR(stat->description,
3503 "Number of MOV instructions in the final generated "
3504 "shader executable.");
3505 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3506 stat->value.u64 = exe->stats.mov_count;
3507 }
3508
3509 vk_outarray_append(&out, stat) {
3510 WRITE_STR(stat->name, "COV Count");
3511 WRITE_STR(stat->description,
3512 "Number of COV instructions in the final generated "
3513 "shader executable.");
3514 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3515 stat->value.u64 = exe->stats.cov_count;
3516 }
3517
3518 vk_outarray_append(&out, stat) {
3519 WRITE_STR(stat->name, "Registers used");
3520 WRITE_STR(stat->description,
3521 "Number of registers used in the final generated "
3522 "shader executable.");
3523 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3524 stat->value.u64 = exe->stats.max_reg + 1;
3525 }
3526
3527 vk_outarray_append(&out, stat) {
3528 WRITE_STR(stat->name, "Half-registers used");
3529 WRITE_STR(stat->description,
3530 "Number of half-registers used in the final generated "
3531 "shader executable.");
3532 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3533 stat->value.u64 = exe->stats.max_half_reg + 1;
3534 }
3535
3536 vk_outarray_append(&out, stat) {
3537 WRITE_STR(stat->name, "Instructions with SS sync bit");
3538 WRITE_STR(stat->description,
3539 "SS bit is set for instructions which depend on a result "
3540 "of \"long\" instructions to prevent RAW hazard.");
3541 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3542 stat->value.u64 = exe->stats.ss;
3543 }
3544
3545 vk_outarray_append(&out, stat) {
3546 WRITE_STR(stat->name, "Instructions with SY sync bit");
3547 WRITE_STR(stat->description,
3548 "SY bit is set for instructions which depend on a result "
3549 "of loads from global memory to prevent RAW hazard.");
3550 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3551 stat->value.u64 = exe->stats.sy;
3552 }
3553
3554 vk_outarray_append(&out, stat) {
3555 WRITE_STR(stat->name, "Estimated cycles stalled on SS");
3556 WRITE_STR(stat->description,
3557 "A better metric to estimate the impact of SS syncs.");
3558 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3559 stat->value.u64 = exe->stats.sstall;
3560 }
3561
3562 for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
3563 vk_outarray_append(&out, stat) {
3564 WRITE_STR(stat->name, "cat%d instructions", i);
3565 WRITE_STR(stat->description,
3566 "Number of cat%d instructions.", i);
3567 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3568 stat->value.u64 = exe->stats.instrs_per_cat[i];
3569 }
3570 }
3571
3572 vk_outarray_append(&out, stat) {
3573 WRITE_STR(stat->name, "STP Count");
3574 WRITE_STR(stat->description,
3575 "Number of STore Private instructions in the final generated "
3576 "shader executable.");
3577 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3578 stat->value.u64 = exe->stats.stp_count;
3579 }
3580
3581 vk_outarray_append(&out, stat) {
3582 WRITE_STR(stat->name, "LDP Count");
3583 WRITE_STR(stat->description,
3584 "Number of LoaD Private instructions in the final generated "
3585 "shader executable.");
3586 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3587 stat->value.u64 = exe->stats.ldp_count;
3588 }
3589
3590 return vk_outarray_status(&out);
3591 }
3592
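/* Copy an internal-representation string to the caller's buffer using the
 * usual Vulkan two-call idiom: with pData == NULL only the required size is
 * returned; otherwise the text is copied and false is returned if it had to
 * be truncated.
 */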
3593 static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR * ir,const char * data)3594 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
3595 const char *data)
3596 {
3597 ir->isText = VK_TRUE;
3598
3599 size_t data_len = strlen(data) + 1;
3600
3601 if (ir->pData == NULL) {
3602 ir->dataSize = data_len;
3603 return true;
3604 }
3605
3606 strncpy(ir->pData, data, ir->dataSize);
3607 if (ir->dataSize < data_len)
3608 return false;
3609
3610 ir->dataSize = data_len;
3611 return true;
3612 }
3613
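/* Return the IRs captured for an executable at pipeline creation time: the
 * initial NIR from SPIR-V, the final NIR handed to ir3, and the IR3
 * disassembly, whichever of those were recorded.
 */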
3614 VKAPI_ATTR VkResult VKAPI_CALL
3615 tu_GetPipelineExecutableInternalRepresentationsKHR(
3616 VkDevice _device,
3617 const VkPipelineExecutableInfoKHR* pExecutableInfo,
3618 uint32_t* pInternalRepresentationCount,
3619 VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
3620 {
3621 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
3622 VK_OUTARRAY_MAKE(out, pInternalRepresentations, pInternalRepresentationCount);
3623 bool incomplete_text = false;
3624
3625 const struct tu_pipeline_executable *exe =
3626 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3627
3628 if (exe->nir_from_spirv) {
3629 vk_outarray_append(&out, ir) {
3630 WRITE_STR(ir->name, "NIR from SPIRV");
3631 WRITE_STR(ir->description,
3632 "Initial NIR before any optimizations");
3633
3634 if (!write_ir_text(ir, exe->nir_from_spirv))
3635 incomplete_text = true;
3636 }
3637 }
3638
3639 if (exe->nir_final) {
3640 vk_outarray_append(&out, ir) {
3641 WRITE_STR(ir->name, "Final NIR");
3642 WRITE_STR(ir->description,
3643 "Final NIR before going into the back-end compiler");
3644
3645 if (!write_ir_text(ir, exe->nir_final))
3646 incomplete_text = true;
3647 }
3648 }
3649
3650 if (exe->disasm) {
3651 vk_outarray_append(&out, ir) {
3652 WRITE_STR(ir->name, "IR3 Assembly");
3653 WRITE_STR(ir->description,
3654 "Final IR3 assembly for the generated shader binary");
3655
3656 if (!write_ir_text(ir, exe->disasm))
3657 incomplete_text = true;
3658 }
3659 }
3660
3661 return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
3662 }
3663