1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  * DEALINGS IN THE SOFTWARE.
26  */
27 
28 #include "common/freedreno_guardband.h"
29 #include "tu_private.h"
30 
31 #include "ir3/ir3_nir.h"
32 #include "main/menums.h"
33 #include "nir/nir.h"
34 #include "nir/nir_builder.h"
35 #include "spirv/nir_spirv.h"
36 #include "util/debug.h"
37 #include "util/mesa-sha1.h"
38 #include "util/u_atomic.h"
39 #include "vk_format.h"
40 #include "vk_util.h"
41 
42 #include "tu_cs.h"
43 
44 /* Emit IB that preloads the descriptors that the shader uses */
45 
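/* Emit one CP_LOAD_STATE6 packet preloading `count` bindless descriptors
 * into the HW descriptor cache: `base` selects the BINDLESS_BASE register
 * and `offset` is the dword offset of the first descriptor in that set.
 */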
46 static void
47 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
48                 enum a6xx_state_block sb, unsigned base, unsigned offset,
49                 unsigned count)
50 {
51    /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
52     * clear if emitting more packets will even help anything. Presumably the
53     * descriptor cache is relatively small, and these packets stop doing
54     * anything when there are too many descriptors.
55     */
56    tu_cs_emit_pkt7(cs, opcode, 3);
57    tu_cs_emit(cs,
58               CP_LOAD_STATE6_0_STATE_TYPE(st) |
59               CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
60               CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
61               CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
62    tu_cs_emit_qw(cs, offset | (base << 28));
63 }
64 
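/* Upper bound, in dwords, of the IB built by tu6_emit_load_state(): each
 * emit_load_state() call produces a fixed 4-dword packet, so we just count
 * how many packets each binding needs per stage.
 */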
65 static unsigned
66 tu6_load_state_size(struct tu_pipeline *pipeline, bool compute)
67 {
68    const unsigned load_state_size = 4;
69    unsigned size = 0;
70    for (unsigned i = 0; i < pipeline->layout->num_sets; i++) {
71       if (!(pipeline->active_desc_sets & (1u << i)))
72          continue;
73 
74       struct tu_descriptor_set_layout *set_layout = pipeline->layout->set[i].layout;
75       for (unsigned j = 0; j < set_layout->binding_count; j++) {
76          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
77          unsigned count = 0;
78          /* Note: some users, like amber for example, pass in
79           * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
80           * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
81           */
82          VkShaderStageFlags stages = compute ?
83             binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
84             binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
85          unsigned stage_count = util_bitcount(stages);
86 
87          if (!binding->array_size)
88             continue;
89 
90          switch (binding->type) {
91          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
92          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
93          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
94          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
95             /* IBO-backed resources only need one packet for all graphics stages */
96             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
97                count += 1;
98             if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
99                count += 1;
100             break;
101          case VK_DESCRIPTOR_TYPE_SAMPLER:
102          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
103          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
104          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
105          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
106             /* Textures and UBOs need a packet for each stage */
107             count = stage_count;
108             break;
109          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
110             /* Because of how we pack combined images and samplers, we
111              * currently can't use one packet for the whole array.
112              */
113             count = stage_count * binding->array_size * 2;
114             break;
115          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
116          case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
117             break;
118          default:
119             unreachable("bad descriptor type");
120          }
121          size += count * load_state_size;
122       }
123    }
124    return size;
125 }
126 
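/* Build the descriptor-prefetch command stream for this pipeline and store
 * it as the pipeline->load_state draw state. The per-binding packet counts
 * must mirror tu6_load_state_size() above.
 */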
127 static void
128 tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
129 {
130    unsigned size = tu6_load_state_size(pipeline, compute);
131    if (size == 0)
132       return;
133 
134    struct tu_cs cs;
135    tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
136 
137    struct tu_pipeline_layout *layout = pipeline->layout;
138    for (unsigned i = 0; i < layout->num_sets; i++) {
139       /* From 13.2.7. Descriptor Set Binding:
140        *
141        *    A compatible descriptor set must be bound for all set numbers that
142        *    any shaders in a pipeline access, at the time that a draw or
143        *    dispatch command is recorded to execute using that pipeline.
144        *    However, if none of the shaders in a pipeline statically use any
145        *    bindings with a particular set number, then no descriptor set need
146        *    be bound for that set number, even if the pipeline layout includes
147        *    a non-trivial descriptor set layout for that set number.
148        *
149        * This means that descriptor sets unused by the pipeline may have a
150        * garbage or 0 BINDLESS_BASE register, which will cause context faults
151        * when prefetching descriptors from these sets. Skip prefetching
152        * from such sets to avoid this. This is also an optimization,
153        * since these prefetches would be useless.
154        */
155       if (!(pipeline->active_desc_sets & (1u << i)))
156          continue;
157 
158       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
159       for (unsigned j = 0; j < set_layout->binding_count; j++) {
160          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
161          unsigned base = i;
162          unsigned offset = binding->offset / 4;
163          /* Note: some users, like amber for example, pass in
164           * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
165           * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
166           */
167          VkShaderStageFlags stages = compute ?
168             binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
169             binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
170          unsigned count = binding->array_size;
171          if (count == 0 || stages == 0)
172             continue;
173          switch (binding->type) {
174          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
175             base = MAX_SETS;
176             offset = (layout->set[i].dynamic_offset_start +
177                       binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
178             FALLTHROUGH;
179          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
180          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
181          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
182             /* IBO-backed resources only need one packet for all graphics stages */
183             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
184                emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
185                                base, offset, count);
186             }
187             if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
188                emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
189                                base, offset, count);
190             }
191             break;
192          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
193          case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
194             /* nothing - input attachment doesn't use bindless */
195             break;
196          case VK_DESCRIPTOR_TYPE_SAMPLER:
197          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
198          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
199             tu_foreach_stage(stage, stages) {
200                emit_load_state(&cs, tu6_stage2opcode(stage),
201                                binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
202                                ST6_SHADER : ST6_CONSTANTS,
203                                tu6_stage2texsb(stage), base, offset, count);
204             }
205             break;
206          }
207          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
208             base = MAX_SETS;
209             offset = (layout->set[i].dynamic_offset_start +
210                       binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
211             FALLTHROUGH;
212          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
213             tu_foreach_stage(stage, stages) {
214                emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
215                                tu6_stage2shadersb(stage), base, offset, count);
216             }
217             break;
218          }
219          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
220             tu_foreach_stage(stage, stages) {
221                /* TODO: We could emit less CP_LOAD_STATE6 if we used
222                 * struct-of-arrays instead of array-of-structs.
223                 */
224                for (unsigned i = 0; i < count; i++) {
225                   unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
226                   unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
227                   emit_load_state(&cs, tu6_stage2opcode(stage),
228                                   ST6_CONSTANTS, tu6_stage2texsb(stage),
229                                   base, tex_offset, 1);
230                   emit_load_state(&cs, tu6_stage2opcode(stage),
231                                   ST6_SHADER, tu6_stage2texsb(stage),
232                                   base, sam_offset, 1);
233                }
234             }
235             break;
236          }
237          default:
238             unreachable("bad descriptor type");
239          }
240       }
241    }
242 
243    pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
244 }
245 
246 struct tu_pipeline_builder
247 {
248    struct tu_device *device;
249    struct tu_pipeline_cache *cache;
250    struct tu_pipeline_layout *layout;
251    const VkAllocationCallbacks *alloc;
252    const VkGraphicsPipelineCreateInfo *create_info;
253 
254    struct tu_shader *shaders[MESA_SHADER_FRAGMENT + 1];
255    struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1];
256    struct ir3_shader_variant *binning_variant;
257    uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1];
258    uint64_t binning_vs_iova;
259 
260    uint32_t additional_cs_reserve_size;
261 
262    struct tu_pvtmem_config pvtmem;
263 
264    bool rasterizer_discard;
265    /* these states are affected by rasterizer_discard */
266    bool emit_msaa_state;
267    VkSampleCountFlagBits samples;
268    bool use_color_attachments;
269    bool use_dual_src_blend;
270    bool alpha_to_coverage;
271    uint32_t color_attachment_count;
272    VkFormat color_attachment_formats[MAX_RTS];
273    VkFormat depth_attachment_format;
274    uint32_t render_components;
275    uint32_t multiview_mask;
276 };
277 
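/* CLEAR, SET, COPY and COPY_INVERTED are the only logic ops whose result
 * does not depend on the existing framebuffer value, so only they can skip
 * reading the destination.
 */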
278 static bool
279 tu_logic_op_reads_dst(VkLogicOp op)
280 {
281    switch (op) {
282    case VK_LOGIC_OP_CLEAR:
283    case VK_LOGIC_OP_COPY:
284    case VK_LOGIC_OP_COPY_INVERTED:
285    case VK_LOGIC_OP_SET:
286       return false;
287    default:
288       return true;
289    }
290 }
291 
292 static VkBlendFactor
293 tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
294 {
295    /* treat dst alpha as 1.0 and avoid reading it */
296    switch (factor) {
297    case VK_BLEND_FACTOR_DST_ALPHA:
298       return VK_BLEND_FACTOR_ONE;
299    case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
300       return VK_BLEND_FACTOR_ZERO;
301    default:
302       return factor;
303    }
304 }
305 
306 static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
307 {
308    switch (factor) {
309    case VK_BLEND_FACTOR_SRC1_COLOR:
310    case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
311    case VK_BLEND_FACTOR_SRC1_ALPHA:
312    case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
313       return true;
314    default:
315       return false;
316    }
317 }
318 
319 static bool
320 tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
321 {
322    if (!info)
323       return false;
324 
325    for (unsigned i = 0; i < info->attachmentCount; i++) {
326       const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
327       if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
328           tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
329           tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
330           tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
331          return true;
332    }
333 
334    return false;
335 }
336 
337 static const struct xs_config {
338    uint16_t reg_sp_xs_ctrl;
339    uint16_t reg_sp_xs_config;
340    uint16_t reg_sp_xs_instrlen;
341    uint16_t reg_hlsq_xs_ctrl;
342    uint16_t reg_sp_xs_first_exec_offset;
343    uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
344 } xs_config[] = {
345    [MESA_SHADER_VERTEX] = {
346       REG_A6XX_SP_VS_CTRL_REG0,
347       REG_A6XX_SP_VS_CONFIG,
348       REG_A6XX_SP_VS_INSTRLEN,
349       REG_A6XX_HLSQ_VS_CNTL,
350       REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
351       REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
352    },
353    [MESA_SHADER_TESS_CTRL] = {
354       REG_A6XX_SP_HS_CTRL_REG0,
355       REG_A6XX_SP_HS_CONFIG,
356       REG_A6XX_SP_HS_INSTRLEN,
357       REG_A6XX_HLSQ_HS_CNTL,
358       REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
359       REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
360    },
361    [MESA_SHADER_TESS_EVAL] = {
362       REG_A6XX_SP_DS_CTRL_REG0,
363       REG_A6XX_SP_DS_CONFIG,
364       REG_A6XX_SP_DS_INSTRLEN,
365       REG_A6XX_HLSQ_DS_CNTL,
366       REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
367       REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
368    },
369    [MESA_SHADER_GEOMETRY] = {
370       REG_A6XX_SP_GS_CTRL_REG0,
371       REG_A6XX_SP_GS_CONFIG,
372       REG_A6XX_SP_GS_INSTRLEN,
373       REG_A6XX_HLSQ_GS_CNTL,
374       REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
375       REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
376    },
377    [MESA_SHADER_FRAGMENT] = {
378       REG_A6XX_SP_FS_CTRL_REG0,
379       REG_A6XX_SP_FS_CONFIG,
380       REG_A6XX_SP_FS_INSTRLEN,
381       REG_A6XX_HLSQ_FS_CNTL,
382       REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
383       REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
384    },
385    [MESA_SHADER_COMPUTE] = {
386       REG_A6XX_SP_CS_CTRL_REG0,
387       REG_A6XX_SP_CS_CONFIG,
388       REG_A6XX_SP_CS_INSTRLEN,
389       REG_A6XX_HLSQ_CS_CNTL,
390       REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
391       REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
392    },
393 };
394 
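/* Size, in dwords, of the immediate constants to upload for a stage,
 * clamped so the upload never goes past the variant's constlen (both the
 * immediate base and constlen are in vec4 units).
 */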
395 static uint32_t
396 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
397 {
398    const struct ir3_const_state *const_state = ir3_const_state(xs);
399    uint32_t base = const_state->offsets.immediate;
400    int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
401 
402    /* truncate size to avoid writing constants that the shader
403     * does not use:
404     */
405    size = MIN2(size + base, xs->constlen) - base;
406 
407    return MAX2(size, 0) * 4;
408 }
409 
410 /* We allocate fixed-length substreams for shader state; however, some
411  * parts of the state may have unbounded length. Their additional space
412  * requirements should be calculated here.
413  */
414 static uint32_t
415 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
416 {
417    uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
418    return size;
419 }
420 
421 void
422 tu6_emit_xs_config(struct tu_cs *cs,
423                    gl_shader_stage stage, /* xs->type, but xs may be NULL */
424                    const struct ir3_shader_variant *xs)
425 {
426    const struct xs_config *cfg = &xs_config[stage];
427 
428    if (!xs) {
429       /* shader stage disabled */
430       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
431       tu_cs_emit(cs, 0);
432 
433       tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
434       tu_cs_emit(cs, 0);
435       return;
436    }
437 
438    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
439    tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
440                   COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
441                   COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
442                   COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
443                   COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
444                   A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
445                   A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
446 
447    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
448    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
449                   A6XX_HLSQ_VS_CNTL_ENABLED);
450 }
451 
452 void
453 tu6_emit_xs(struct tu_cs *cs,
454             gl_shader_stage stage, /* xs->type, but xs may be NULL */
455             const struct ir3_shader_variant *xs,
456             const struct tu_pvtmem_config *pvtmem,
457             uint64_t binary_iova)
458 {
459    const struct xs_config *cfg = &xs_config[stage];
460 
461    if (!xs) {
462       /* shader stage disabled */
463       return;
464    }
465 
466    enum a6xx_threadsize thrsz =
467       xs->info.double_threadsize ? THREAD128 : THREAD64;
468    switch (stage) {
469    case MESA_SHADER_VERTEX:
470       tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
471                .fullregfootprint = xs->info.max_reg + 1,
472                .halfregfootprint = xs->info.max_half_reg + 1,
473                .branchstack = ir3_shader_branchstack_hw(xs),
474                .mergedregs = xs->mergedregs,
475       ));
476       break;
477    case MESA_SHADER_TESS_CTRL:
478       tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
479                .fullregfootprint = xs->info.max_reg + 1,
480                .halfregfootprint = xs->info.max_half_reg + 1,
481                .branchstack = ir3_shader_branchstack_hw(xs),
482       ));
483       break;
484    case MESA_SHADER_TESS_EVAL:
485       tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
486                .fullregfootprint = xs->info.max_reg + 1,
487                .halfregfootprint = xs->info.max_half_reg + 1,
488                .branchstack = ir3_shader_branchstack_hw(xs),
489                .mergedregs = xs->mergedregs,
490       ));
491       break;
492    case MESA_SHADER_GEOMETRY:
493       tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
494                .fullregfootprint = xs->info.max_reg + 1,
495                .halfregfootprint = xs->info.max_half_reg + 1,
496                .branchstack = ir3_shader_branchstack_hw(xs),
497       ));
498       break;
499    case MESA_SHADER_FRAGMENT:
500       tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
501                .fullregfootprint = xs->info.max_reg + 1,
502                .halfregfootprint = xs->info.max_half_reg + 1,
503                .branchstack = ir3_shader_branchstack_hw(xs),
504                .mergedregs = xs->mergedregs,
505                .threadsize = thrsz,
506                .pixlodenable = xs->need_pixlod,
507                .diff_fine = xs->need_fine_derivatives,
508                .varying = xs->total_in != 0,
509                /* unknown bit, seems unnecessary */
510                .unk24 = true,
511       ));
512       break;
513    case MESA_SHADER_COMPUTE:
514       tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
515                .fullregfootprint = xs->info.max_reg + 1,
516                .halfregfootprint = xs->info.max_half_reg + 1,
517                .branchstack = ir3_shader_branchstack_hw(xs),
518                .mergedregs = xs->mergedregs,
519                .threadsize = thrsz,
520       ));
521       break;
522    default:
523       unreachable("bad shader stage");
524    }
525 
526    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
527    tu_cs_emit(cs, xs->instrlen);
528 
529    /* emit program binary & private memory layout
530     * binary_iova should be aligned to 1 instrlen unit (128 bytes)
531     */
532 
533    assert((binary_iova & 0x7f) == 0);
534    assert((pvtmem->iova & 0x1f) == 0);
535 
536    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
537    tu_cs_emit(cs, 0);
538    tu_cs_emit_qw(cs, binary_iova);
539    tu_cs_emit(cs,
540               A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
541    tu_cs_emit_qw(cs, pvtmem->iova);
542    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
543                   COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
544 
545    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
546    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
547 
548    tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
549    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
550                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
551                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
552                   CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
553                   CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen));
554    tu_cs_emit_qw(cs, binary_iova);
555 
556    /* emit immediates */
557 
558    const struct ir3_const_state *const_state = ir3_const_state(xs);
559    uint32_t base = const_state->offsets.immediate;
560    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
561 
562    if (immediate_size > 0) {
563       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
564       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
565                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
566                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
567                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
568                  CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
569       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
570       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
571 
572       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
573    }
574 
575    if (const_state->constant_data_ubo != -1) {
576       uint64_t iova = binary_iova + xs->info.constant_data_offset;
577 
578       /* Upload UBO state for the constant data. */
579       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
580       tu_cs_emit(cs,
581                  CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
582                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
583                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
584                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
585                  CP_LOAD_STATE6_0_NUM_UNIT(1));
586       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
587       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
588       int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
589       tu_cs_emit_qw(cs,
590                     iova |
591                     (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
592 
593       /* Upload the constant data to the const file if needed. */
594       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
595 
596       for (int i = 0; i < ubo_state->num_enabled; i++) {
597          if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
598              ubo_state->range[i].ubo.bindless) {
599             continue;
600          }
601 
602          uint32_t start = ubo_state->range[i].start;
603          uint32_t end = ubo_state->range[i].end;
604          uint32_t size = MIN2(end - start,
605                               (16 * xs->constlen) - ubo_state->range[i].offset);
606 
607          tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
608          tu_cs_emit(cs,
609                     CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
610                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
611                     CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
612                     CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
613                     CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
614          tu_cs_emit_qw(cs, iova + start);
615       }
616    }
617 }
618 
619 static void
620 tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
621                    const struct ir3_shader_variant *v,
622                    const struct tu_pvtmem_config *pvtmem,
623                    uint64_t binary_iova)
624 {
625    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
626          .cs_state = true,
627          .cs_ibo = true));
628 
629    tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
630    tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
631 
632    uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
633    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
634    tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
635                   A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
636 
637    if (cs->device->physical_device->info->a6xx.has_lpac) {
638       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
639       tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
640                      A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
641    }
642 
643    uint32_t local_invocation_id =
644       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
645    uint32_t work_group_id =
646       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
647 
648    enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
649    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
650    tu_cs_emit(cs,
651               A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
652               A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
653               A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
654               A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
655    tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
656                   A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
657 
658    if (cs->device->physical_device->info->a6xx.has_lpac) {
659       tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
660       tu_cs_emit(cs,
661                  A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
662                  A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
663                  A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
664                  A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
665       tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
666                      A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
667    }
668 }
669 
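/* Program the VFD_CONTROL registers with the sysval register ids consumed
 * by the geometry stages (vertex/instance id, tess coords, patch ids,
 * primitive id, view id). regid(63, 0) marks an unused sysval.
 */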
670 static void
671 tu6_emit_vs_system_values(struct tu_cs *cs,
672                           const struct ir3_shader_variant *vs,
673                           const struct ir3_shader_variant *hs,
674                           const struct ir3_shader_variant *ds,
675                           const struct ir3_shader_variant *gs,
676                           bool primid_passthru)
677 {
678    const uint32_t vertexid_regid =
679          ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
680    const uint32_t instanceid_regid =
681          ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
682    const uint32_t tess_coord_x_regid = hs ?
683          ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
684          regid(63, 0);
685    const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
686          tess_coord_x_regid + 1 :
687          regid(63, 0);
688    const uint32_t hs_rel_patch_regid = hs ?
689          ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
690          regid(63, 0);
691    const uint32_t ds_rel_patch_regid = hs ?
692          ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
693          regid(63, 0);
694    const uint32_t hs_invocation_regid = hs ?
695          ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
696          regid(63, 0);
697    const uint32_t gs_primitiveid_regid = gs ?
698          ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
699          regid(63, 0);
700    const uint32_t vs_primitiveid_regid = hs ?
701          ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
702          gs_primitiveid_regid;
703    const uint32_t ds_primitiveid_regid = ds ?
704          ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
705          regid(63, 0);
706    const uint32_t gsheader_regid = gs ?
707          ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
708          regid(63, 0);
709 
710    /* Note: we currently don't support multiview with tess or GS. If we did,
711     * and the HW actually works, then we'd have to somehow share this across
712     * stages. Note that the blob doesn't support this either.
713     */
714    const uint32_t viewid_regid =
715       ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
716 
717    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
718    tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
719                   A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
720                   A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
721                   A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
722    tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
723                   A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
724    tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
725                   A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
726                   A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
727                   A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
728    tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
729    tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
730                   0xfc00); /* VFD_CONTROL_5 */
731    tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
732 }
733 
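/* Emit the streamout (transform feedback) program. Each captured output
 * component gets an SO program RAM entry mapping a VPC location to a
 * (buffer, dword offset) pair; two entries are packed per VPC_SO_PROG
 * write (the A/B halves).
 */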
734 static void
735 tu6_setup_streamout(struct tu_cs *cs,
736                     const struct ir3_shader_variant *v,
737                     struct ir3_shader_linkage *l)
738 {
739    const struct ir3_stream_output_info *info = &v->shader->stream_output;
740    /* Note: 64 here comes from the HW layout of the program RAM. The program
741     * for stream N is at DWORD 64 * N.
742     */
743 #define A6XX_SO_PROG_DWORDS 64
744    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
745    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
746    uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {};
747 
748    /* TODO: streamout state should be in a non-GMEM draw state */
749 
750    /* no streamout: */
751    if (info->num_outputs == 0) {
752       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
753       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
754       tu_cs_emit(cs, 0);
755       tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
756       tu_cs_emit(cs, 0);
757       return;
758    }
759 
760    /* is there something to do with info->stride[i]? */
761 
762    for (unsigned i = 0; i < info->num_outputs; i++) {
763       const struct ir3_stream_output *out = &info->output[i];
764       unsigned k = out->register_index;
765       unsigned idx;
766 
767       /* Skip it, if it's an output that was never assigned a register. */
768       if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
769          continue;
770 
771       ncomp[out->output_buffer] += out->num_components;
772 
773       /* The linkage map is sorted in the order the frag shader wants
774        * things, so this is a bit less than ideal here..
775        */
776       for (idx = 0; idx < l->cnt; idx++)
777          if (l->var[idx].regid == v->outputs[k].regid)
778             break;
779 
780       debug_assert(idx < l->cnt);
781 
782       for (unsigned j = 0; j < out->num_components; j++) {
783          unsigned c   = j + out->start_component;
784          unsigned loc = l->var[idx].loc + c;
785          unsigned off = j + out->dst_offset;  /* in dwords */
786 
787          assert(loc < A6XX_SO_PROG_DWORDS * 2);
788          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
789          if (loc & 1) {
790             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
791                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
792                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
793          } else {
794             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
795                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
796                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
797          }
798          BITSET_SET(valid_dwords, dword);
799       }
800    }
801 
802    unsigned prog_count = 0;
803    unsigned start, end;
804    BITSET_FOREACH_RANGE(start, end, valid_dwords,
805                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
806       prog_count += end - start + 1;
807    }
808 
809    tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
810    tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
811    tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
812                   COND(ncomp[0] > 0,
813                        A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
814                   COND(ncomp[1] > 0,
815                        A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
816                   COND(ncomp[2] > 0,
817                        A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
818                   COND(ncomp[3] > 0,
819                        A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
820    for (uint32_t i = 0; i < 4; i++) {
821       tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i));
822       tu_cs_emit(cs, ncomp[i]);
823    }
824    bool first = true;
825    BITSET_FOREACH_RANGE(start, end, valid_dwords,
826                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
827       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
828       tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
829                      A6XX_VPC_SO_CNTL_ADDR(start));
830       for (unsigned i = start; i < end; i++) {
831          tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
832          tu_cs_emit(cs, prog[i]);
833       }
834       first = false;
835    }
836 }
837 
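/* Upload `size` dwords of inline constants with CP_LOAD_STATE6. `size`
 * must be a multiple of 4 since constants are loaded in vec4 units, and
 * `offset` is a byte offset into `dwords`.
 */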
838 static void
839 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
840                enum a6xx_state_block block, uint32_t offset,
841                uint32_t size, const uint32_t *dwords) {
842    assert(size % 4 == 0);
843 
844    tu_cs_emit_pkt7(cs, opcode, 3 + size);
845    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
846          CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
847          CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
848          CP_LOAD_STATE6_0_STATE_BLOCK(block) |
849          CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
850 
851    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
852    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
853    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
854 
855    tu_cs_emit_array(cs, dwords, size);
856 }
857 
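/* Upload the producer's output location map (the primitive map) into the
 * consumer stage's constants, truncated to fit the consumer's constlen.
 */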
858 static void
859 tu6_emit_link_map(struct tu_cs *cs,
860                   const struct ir3_shader_variant *producer,
861                   const struct ir3_shader_variant *consumer,
862                   enum a6xx_state_block sb)
863 {
864    const struct ir3_const_state *const_state = ir3_const_state(consumer);
865    uint32_t base = const_state->offsets.primitive_map;
866    int size = DIV_ROUND_UP(consumer->input_size, 4);
867 
868    size = (MIN2(size + base, consumer->constlen) - base) * 4;
869    if (size <= 0)
870       return;
871 
872    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
873                          producer->output_loc);
874 }
875 
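/* Translate a GS output primitive (a GL enum in nir shader_info) to the
 * a6xx tessellation output type used in PC_PRIMITIVE_CNTL_5.
 */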
876 static uint16_t
877 gl_primitive_to_tess(uint16_t primitive) {
878    switch (primitive) {
879    case GL_POINTS:
880       return TESS_POINTS;
881    case GL_LINE_STRIP:
882       return TESS_LINES;
883    case GL_TRIANGLE_STRIP:
884       return TESS_CW_TRIS;
885    default:
886       unreachable("");
887    }
888 }
889 
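/* Emit the SP/VPC/PC/GRAS state that describes the outputs of the last
 * geometry stage (VS, DS or GS), their linkage with the FS, and the
 * tessellation/GS-specific primitive state.
 */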
890 void
891 tu6_emit_vpc(struct tu_cs *cs,
892              const struct ir3_shader_variant *vs,
893              const struct ir3_shader_variant *hs,
894              const struct ir3_shader_variant *ds,
895              const struct ir3_shader_variant *gs,
896              const struct ir3_shader_variant *fs,
897              uint32_t patch_control_points)
898 {
899    /* note: doesn't compile as static because of the array regs.. */
900    const struct reg_config {
901       uint16_t reg_sp_xs_out_reg;
902       uint16_t reg_sp_xs_vpc_dst_reg;
903       uint16_t reg_vpc_xs_pack;
904       uint16_t reg_vpc_xs_clip_cntl;
905       uint16_t reg_gras_xs_cl_cntl;
906       uint16_t reg_pc_xs_out_cntl;
907       uint16_t reg_sp_xs_primitive_cntl;
908       uint16_t reg_vpc_xs_layer_cntl;
909       uint16_t reg_gras_xs_layer_cntl;
910    } reg_config[] = {
911       [MESA_SHADER_VERTEX] = {
912          REG_A6XX_SP_VS_OUT_REG(0),
913          REG_A6XX_SP_VS_VPC_DST_REG(0),
914          REG_A6XX_VPC_VS_PACK,
915          REG_A6XX_VPC_VS_CLIP_CNTL,
916          REG_A6XX_GRAS_VS_CL_CNTL,
917          REG_A6XX_PC_VS_OUT_CNTL,
918          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
919          REG_A6XX_VPC_VS_LAYER_CNTL,
920          REG_A6XX_GRAS_VS_LAYER_CNTL
921       },
922       [MESA_SHADER_TESS_CTRL] = {
923          0,
924          0,
925          0,
926          0,
927          0,
928          REG_A6XX_PC_HS_OUT_CNTL,
929          0,
930          0,
931          0
932       },
933       [MESA_SHADER_TESS_EVAL] = {
934          REG_A6XX_SP_DS_OUT_REG(0),
935          REG_A6XX_SP_DS_VPC_DST_REG(0),
936          REG_A6XX_VPC_DS_PACK,
937          REG_A6XX_VPC_DS_CLIP_CNTL,
938          REG_A6XX_GRAS_DS_CL_CNTL,
939          REG_A6XX_PC_DS_OUT_CNTL,
940          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
941          REG_A6XX_VPC_DS_LAYER_CNTL,
942          REG_A6XX_GRAS_DS_LAYER_CNTL
943       },
944       [MESA_SHADER_GEOMETRY] = {
945          REG_A6XX_SP_GS_OUT_REG(0),
946          REG_A6XX_SP_GS_VPC_DST_REG(0),
947          REG_A6XX_VPC_GS_PACK,
948          REG_A6XX_VPC_GS_CLIP_CNTL,
949          REG_A6XX_GRAS_GS_CL_CNTL,
950          REG_A6XX_PC_GS_OUT_CNTL,
951          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
952          REG_A6XX_VPC_GS_LAYER_CNTL,
953          REG_A6XX_GRAS_GS_LAYER_CNTL
954       },
955    };
956 
957    const struct ir3_shader_variant *last_shader;
958    if (gs) {
959       last_shader = gs;
960    } else if (hs) {
961       last_shader = ds;
962    } else {
963       last_shader = vs;
964    }
965 
966    const struct reg_config *cfg = &reg_config[last_shader->type];
967 
968    struct ir3_shader_linkage linkage = {
969       .primid_loc = 0xff,
970       .clip0_loc = 0xff,
971       .clip1_loc = 0xff,
972    };
973    if (fs)
974       ir3_link_shaders(&linkage, last_shader, fs, true);
975 
976    if (last_shader->shader->stream_output.num_outputs)
977       ir3_link_stream_out(&linkage, last_shader);
978 
979    /* We do this after linking shaders in order to know whether PrimID
980     * passthrough needs to be enabled.
981     */
982    bool primid_passthru = linkage.primid_loc != 0xff;
983    tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);
984 
985    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
986    tu_cs_emit(cs, ~linkage.varmask[0]);
987    tu_cs_emit(cs, ~linkage.varmask[1]);
988    tu_cs_emit(cs, ~linkage.varmask[2]);
989    tu_cs_emit(cs, ~linkage.varmask[3]);
990 
991    /* a6xx finds position/pointsize at the end */
992    const uint32_t pointsize_regid =
993       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
994    const uint32_t layer_regid =
995       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
996    const uint32_t view_regid =
997       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
998    const uint32_t clip0_regid =
999       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
1000    const uint32_t clip1_regid =
1001       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
1002    uint32_t flags_regid = gs ?
1003       ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
1004 
1005    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
1006 
1007    if (layer_regid != regid(63, 0)) {
1008       layer_loc = linkage.max_loc;
1009       ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
1010    }
1011 
1012    if (view_regid != regid(63, 0)) {
1013       view_loc = linkage.max_loc;
1014       ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc);
1015    }
1016 
1017    unsigned extra_pos = 0;
1018 
1019    for (unsigned i = 0; i < last_shader->outputs_count; i++) {
1020       if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
1021          continue;
1022 
1023       if (position_loc == 0xff)
1024          position_loc = linkage.max_loc;
1025 
1026       ir3_link_add(&linkage, last_shader->outputs[i].regid,
1027                    0xf, position_loc + 4 * last_shader->outputs[i].view);
1028       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
1029    }
1030 
1031    if (pointsize_regid != regid(63, 0)) {
1032       pointsize_loc = linkage.max_loc;
1033       ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
1034    }
1035 
1036    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
1037 
1038    /* Handle the case where clip/cull distances aren't read by the FS */
1039    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
1040    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
1041       clip0_loc = linkage.max_loc;
1042       ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
1043    }
1044    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
1045       clip1_loc = linkage.max_loc;
1046       ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
1047    }
1048 
1049    tu6_setup_streamout(cs, last_shader, &linkage);
1050 
1051    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
1052     * at least when a DS is the last stage, so add a dummy output to keep it
1053     * happy if there aren't any. We do this late in order to avoid emitting
1054     * any unused code and make sure that optimizations don't remove it.
1055     */
1056    if (linkage.cnt == 0)
1057       ir3_link_add(&linkage, 0, 0x1, linkage.max_loc);
1058 
1059    /* map outputs of the last shader to VPC */
1060    assert(linkage.cnt <= 32);
1061    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
1062    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
1063    uint32_t sp_out[16] = {0};
1064    uint32_t sp_vpc_dst[8] = {0};
1065    for (uint32_t i = 0; i < linkage.cnt; i++) {
1066       ((uint16_t *) sp_out)[i] =
1067          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
1068          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
1069       ((uint8_t *) sp_vpc_dst)[i] =
1070          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
1071    }
1072 
1073    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
1074    tu_cs_emit_array(cs, sp_out, sp_out_count);
1075 
1076    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
1077    tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
1078 
1079    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
1080    tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
1081                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
1082                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
1083                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
1084 
1085    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1086    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1087                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1088                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1089 
1090    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1091    tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1092                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1093 
1094    const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1095 
1096    for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1097       const struct ir3_shader_variant *shader = geom_shaders[i];
1098       if (!shader)
1099          continue;
1100 
1101       bool primid = shader->type != MESA_SHADER_VERTEX &&
1102          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1103 
1104       tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1105       if (shader == last_shader) {
1106          tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1107                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1108                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1109                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1110                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1111                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
1112       } else {
1113          tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1114       }
1115    }
1116 
1117    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1118    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1119                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1120 
1121    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1122    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1123                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));
1124 
1125    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1126    tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1127                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1128 
1129    tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));
1130 
1131    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1132    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
1133                   COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1134                   A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
1135                   A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
1136 
1137    if (hs) {
1138       shader_info *hs_info = &hs->shader->nir->info;
1139 
1140       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1141       tu_cs_emit(cs, hs_info->tess.tcs_vertices_out);
1142 
1143       /* Total attribute slots in HS incoming patch. */
1144       tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1145       tu_cs_emit(cs, patch_control_points * vs->output_size / 4);
1146 
1147       const uint32_t wavesize = 64;
1148       const uint32_t max_wave_input_size = 64;
1149 
1150       /* note: if HS is really just the VS extended, then this
1151        * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
1152        * however that doesn't match the blob, and fails some dEQP tests.
1153        */
1154       uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
1155       uint32_t max_prims_per_wave =
1156          max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
1157       prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);
1158 
1159       uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
1160       uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
1161 
1162       tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1163       tu_cs_emit(cs, wave_input_size);
1164 
1165       /* In SPIR-V generated from GLSL, the tessellation primitive params
1166        * are specified in the tess eval shader, but in SPIR-V generated from
1167        * HLSL, they are specified in the tess control shader. */
1168       shader_info *tess_info =
1169             ds->shader->nir->info.tess.spacing == TESS_SPACING_UNSPECIFIED ?
1170             &hs->shader->nir->info : &ds->shader->nir->info;
1171       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
1172       uint32_t output;
1173       if (tess_info->tess.point_mode)
1174          output = TESS_POINTS;
1175       else if (tess_info->tess.primitive_mode == GL_ISOLINES)
1176          output = TESS_LINES;
1177       else if (tess_info->tess.ccw)
1178          output = TESS_CCW_TRIS;
1179       else
1180          output = TESS_CW_TRIS;
1181 
1182       enum a6xx_tess_spacing spacing;
1183       switch (tess_info->tess.spacing) {
1184       case TESS_SPACING_EQUAL:
1185          spacing = TESS_EQUAL;
1186          break;
1187       case TESS_SPACING_FRACTIONAL_ODD:
1188          spacing = TESS_FRACTIONAL_ODD;
1189          break;
1190       case TESS_SPACING_FRACTIONAL_EVEN:
1191          spacing = TESS_FRACTIONAL_EVEN;
1192          break;
1193       case TESS_SPACING_UNSPECIFIED:
1194       default:
1195          unreachable("invalid tess spacing");
1196       }
1197       tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
1198             A6XX_PC_TESS_CNTL_OUTPUT(output));
1199 
1200       tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1201       tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1202    }
1203 
1204 
1205    if (gs) {
1206       uint32_t vertices_out, invocations, output, vec4_size;
1207       uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1208 
1209       /* this detects the tu_clear_blit path, which doesn't set ->nir */
1210       if (gs->shader->nir) {
1211          if (hs) {
1212             tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1213          } else {
1214             tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1215          }
1216          vertices_out = gs->shader->nir->info.gs.vertices_out - 1;
1217          output = gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive);
1218          invocations = gs->shader->nir->info.gs.invocations - 1;
1219          /* Size of per-primitive allocation in ldlw memory in vec4s. */
1220          vec4_size = gs->shader->nir->info.gs.vertices_in *
1221                      DIV_ROUND_UP(prev_stage_output_size, 4);
1222       } else {
1223          vertices_out = 3;
1224          output = TESS_CW_TRIS;
1225          invocations = 0;
1226          vec4_size = 0;
1227       }
1228 
1229       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1230       tu_cs_emit(cs,
1231             A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
1232             A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
1233             A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
1234 
1235       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
1236       tu_cs_emit(cs, 0xff);
1237 
1238       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1239       tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1240 
1241       uint32_t prim_size = prev_stage_output_size;
1242       if (prim_size > 64)
1243          prim_size = 64;
1244       else if (prim_size == 64)
1245          prim_size = 63;
1246       tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1247       tu_cs_emit(cs, prim_size);
1248    }
1249 }
1250 
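/* Compute the packed 2-bit interpolation and point-sprite replacement
 * modes for each enabled component of FS input `index`. Returns the number
 * of mode bits produced, so the caller can handle values that straddle a
 * 32-bit register boundary. E.g. a point-coord input with compmask 0x3
 * yields ps_repl_mode = PS_REPL_S | (PS_REPL_T << 2) and returns 4.
 */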
1251 static int
1252 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
1253                      uint32_t index,
1254                      uint8_t *interp_mode,
1255                      uint8_t *ps_repl_mode)
1256 {
1257    enum
1258    {
1259       INTERP_SMOOTH = 0,
1260       INTERP_FLAT = 1,
1261       INTERP_ZERO = 2,
1262       INTERP_ONE = 3,
1263    };
1264    enum
1265    {
1266       PS_REPL_NONE = 0,
1267       PS_REPL_S = 1,
1268       PS_REPL_T = 2,
1269       PS_REPL_ONE_MINUS_T = 3,
1270    };
1271 
1272    const uint32_t compmask = fs->inputs[index].compmask;
1273 
1274    /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
1275     * fourth component occupy three consecutive varying slots
1276     */
1277    int shift = 0;
1278    *interp_mode = 0;
1279    *ps_repl_mode = 0;
1280    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
1281       if (compmask & 0x1) {
1282          *ps_repl_mode |= PS_REPL_S << shift;
1283          shift += 2;
1284       }
1285       if (compmask & 0x2) {
1286          *ps_repl_mode |= PS_REPL_T << shift;
1287          shift += 2;
1288       }
1289       if (compmask & 0x4) {
1290          *interp_mode |= INTERP_ZERO << shift;
1291          shift += 2;
1292       }
1293       if (compmask & 0x8) {
1294          *interp_mode |= INTERP_ONE << 6;
1295          shift += 2;
1296       }
1297    } else if (fs->inputs[index].flat) {
1298       for (int i = 0; i < 4; i++) {
1299          if (compmask & (1 << i)) {
1300             *interp_mode |= INTERP_FLAT << shift;
1301             shift += 2;
1302          }
1303       }
1304    }
1305 
1306    return shift;
1307 }
1308 
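/* Gather the per-varying modes from tu6_vpc_varying_mode() and write the
 * VPC_VARYING_INTERP_MODE / VPC_VARYING_PS_REPL_MODE register arrays,
 * indexed by the varying's inloc (2 bits per component).
 */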
1309 static void
1310 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
1311                            const struct ir3_shader_variant *fs)
1312 {
1313    uint32_t interp_modes[8] = { 0 };
1314    uint32_t ps_repl_modes[8] = { 0 };
1315 
1316    if (fs) {
1317       for (int i = -1;
1318            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
1319 
1320          /* get the mode for input i */
1321          uint8_t interp_mode;
1322          uint8_t ps_repl_mode;
1323          const int bits =
1324             tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
1325 
1326          /* OR the mode into the array */
1327          const uint32_t inloc = fs->inputs[i].inloc * 2;
1328          uint32_t n = inloc / 32;
1329          uint32_t shift = inloc % 32;
1330          interp_modes[n] |= interp_mode << shift;
1331          ps_repl_modes[n] |= ps_repl_mode << shift;
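         /* If this input's mode bits straddle a 32-bit register boundary,
          * spill the remaining high bits into the next dword.
          */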
1332          if (shift + bits > 32) {
1333             n++;
1334             shift = 32 - shift;
1335 
1336             interp_modes[n] |= interp_mode >> shift;
1337             ps_repl_modes[n] |= ps_repl_mode >> shift;
1338          }
1339       }
1340    }
1341 
1342    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
1343    tu_cs_emit_array(cs, interp_modes, 8);
1344 
1345    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
1346    tu_cs_emit_array(cs, ps_repl_modes, 8);
1347 }
1348 
1349 void
1350 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1351 {
1352    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1353    uint32_t ij_regid[IJ_COUNT];
1354    uint32_t smask_in_regid;
1355 
1356    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1357    bool enable_varyings = fs->total_in > 0;
1358 
1359    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1360    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1361    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1362    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1363    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1364    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1365       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1366 
1367    if (fs->num_sampler_prefetch > 0) {
1368       assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
1369       /* also, it seems like ij_pix is *required* to be r0.x */
1370       assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1371    }
1372 
1373    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1374    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1375          A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
1376          0x7000);    // XXX);
1377    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1378       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1379       tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
1380                      A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
1381                      A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
1382                      A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
1383                      A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
1384                      COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
1385                      A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
1386    }
1387 
1388    if (fs->num_sampler_prefetch > 0) {
1389       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1390       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1391          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1392          tu_cs_emit(cs,
1393                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1394                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1395       }
1396    }
1397 
1398    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
1399    tu_cs_emit(cs, 0x7);
1400    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
1401                   A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
1402                   A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
1403                   A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE]));
1404    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
1405                   A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
1406                   A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
1407                   A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
1408    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
1409                   A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
1410                   A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
1411                   A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
1412    tu_cs_emit(cs, 0xfcfc);
1413 
1414    enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1415    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1416    tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) |
1417                   COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS));
1418 
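   /* frag_face, fragcoord and the ij_size sysval appear to be delivered via
    * the linear IJ interpolators, so request linear-pixel (or linear-sample
    * when sample shading) barycentrics for them below.
    */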
1419    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1420    bool need_size_persamp = false;
1421    if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) {
1422       if (sample_shading)
1423          need_size_persamp = true;
1424       else
1425          need_size = true;
1426    }
1427 
1428    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1429    tu_cs_emit(cs,
1430          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1431          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1432          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1433          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1434          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1435          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1436          COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1437          COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1438          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1439 
1440    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1441    tu_cs_emit(cs,
1442          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1443          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1444          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1445          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1446          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1447          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1448          COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1449          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1450          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1451          COND(fs->fragcoord_compmask != 0,
1452                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1453    tu_cs_emit(cs,
1454          A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1455             sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1456          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1457          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1458          CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) |
1459          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1460 
1461    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1462    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1463 
1464    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1465    tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1466               A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1467                  sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1468 
1469    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1470    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1471 }
1472 
1473 static void
1474 tu6_emit_fs_outputs(struct tu_cs *cs,
1475                     const struct ir3_shader_variant *fs,
1476                     uint32_t mrt_count, bool dual_src_blend,
1477                     uint32_t render_components,
1478                     bool no_earlyz,
1479                     struct tu_pipeline *pipeline)
1480 {
1481    uint32_t smask_regid, posz_regid, stencilref_regid;
1482 
1483    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1484    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1485    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1486 
1487    uint32_t fragdata_regid[8];
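   /* color0_mrt means the shader writes a single FRAG_RESULT_COLOR output
    * that is broadcast to every MRT, so replicate its register to all slots.
    */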
1488    if (fs->color0_mrt) {
1489       fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
1490       for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
1491          fragdata_regid[i] = fragdata_regid[0];
1492    } else {
1493       for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
1494          fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1495    }
1496 
1497    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1498    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1499                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1500                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1501                   COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1502    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1503 
1504    uint32_t fs_render_components = 0;
1505 
1506    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
1507    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1508       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1509                      (COND(fragdata_regid[i] & HALF_REG_ID,
1510                            A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1511 
1512       if (VALIDREG(fragdata_regid[i])) {
1513          fs_render_components |= 0xf << (i * 4);
1514       }
1515    }
1516 
1517    /* dual source blending has an extra fs output in the 2nd slot */
1518    if (dual_src_blend) {
1519       fs_render_components |= 0xf << 4;
1520    }
1521 
1522    /* There is no point in enabling a component the shader does not write.
1523     * Per the Vulkan spec this is undefined behavior, but a few apps rely on
1524     * the attachment being left unchanged when the FS has no matching output.
1525     */
1526    fs_render_components &= render_components;
1527 
1528    tu_cs_emit_regs(cs,
1529                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1530 
1531    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
1532    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1533                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1534                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1535                   COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1536    tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));
1537 
1538    tu_cs_emit_regs(cs,
1539                    A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1540 
1541    if (pipeline) {
1542       pipeline->lrz.fs_has_kill = fs->has_kill;
1543       pipeline->lrz.early_fragment_tests = fs->shader->nir->info.fs.early_fragment_tests;
1544 
1545       if ((fs->shader && !fs->shader->nir->info.fs.early_fragment_tests) &&
1546           (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
1547          pipeline->lrz.force_late_z = true;
1548       }
1549    }
1550 }
1551 
1552 static void
1553 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1554                           const struct ir3_shader_variant *vs,
1555                           const struct ir3_shader_variant *hs,
1556                           const struct ir3_shader_variant *ds,
1557                           const struct ir3_shader_variant *gs,
1558                           uint32_t cps_per_patch)
1559 {
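   /* The VS feeds either the HS (cps_per_patch control points per patch) or
    * the GS (vertices_in per input primitive); that vertex count determines
    * the per-primitive strides computed below.
    */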
1560    uint32_t num_vertices =
1561          hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;
1562 
1563    uint32_t vs_params[4] = {
1564       vs->output_size * num_vertices * 4,  /* vs primitive stride */
1565       vs->output_size * 4,                 /* vs vertex stride */
1566       0,
1567       0,
1568    };
1569    uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
1570    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
1571                   ARRAY_SIZE(vs_params), vs_params);
1572 
1573    if (hs) {
1574       assert(ds->type != MESA_SHADER_NONE);
1575       uint32_t hs_params[4] = {
1576          vs->output_size * num_vertices * 4,  /* hs primitive stride */
1577          vs->output_size * 4,                 /* hs vertex stride */
1578          hs->output_size,
1579          cps_per_patch,
1580       };
1581 
1582       uint32_t hs_base = hs->const_state->offsets.primitive_param;
1583       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
1584                      ARRAY_SIZE(hs_params), hs_params);
1585       if (gs)
1586          num_vertices = gs->shader->nir->info.gs.vertices_in;
1587 
1588       uint32_t ds_params[4] = {
1589          ds->output_size * num_vertices * 4,  /* ds primitive stride */
1590          ds->output_size * 4,                 /* ds vertex stride */
1591          hs->output_size,                     /* hs vertex stride (dwords) */
1592          hs->shader->nir->info.tess.tcs_vertices_out
1593       };
1594 
1595       uint32_t ds_base = ds->const_state->offsets.primitive_param;
1596       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
1597                      ARRAY_SIZE(ds_params), ds_params);
1598    }
1599 
1600    if (gs) {
1601       const struct ir3_shader_variant *prev = ds ? ds : vs;
1602       uint32_t gs_params[4] = {
1603          prev->output_size * num_vertices * 4,  /* gs primitive stride */
1604          prev->output_size * 4,                 /* gs vertex stride */
1605          0,
1606          0,
1607       };
1608       uint32_t gs_base = gs->const_state->offsets.primitive_param;
1609       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
1610                      ARRAY_SIZE(gs_params), gs_params);
1611    }
1612 }
1613 
1614 static void
1615 tu6_emit_program_config(struct tu_cs *cs,
1616                         struct tu_pipeline_builder *builder)
1617 {
1618    gl_shader_stage stage = MESA_SHADER_VERTEX;
1619 
1620    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1621 
1622    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
1623          .vs_state = true,
1624          .hs_state = true,
1625          .ds_state = true,
1626          .gs_state = true,
1627          .fs_state = true,
1628          .gfx_ibo = true));
1629    for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
1630       tu6_emit_xs_config(cs, stage, builder->variants[stage]);
1631    }
1632 }
1633 
1634 static void
1635 tu6_emit_program(struct tu_cs *cs,
1636                  struct tu_pipeline_builder *builder,
1637                  bool binning_pass,
1638                  struct tu_pipeline *pipeline)
1639 {
1640    const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
1641    const struct ir3_shader_variant *bs = builder->binning_variant;
1642    const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
1643    const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
1644    const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
1645    const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
1646    gl_shader_stage stage = MESA_SHADER_VERTEX;
1647    uint32_t cps_per_patch = builder->create_info->pTessellationState ?
1648       builder->create_info->pTessellationState->patchControlPoints : 0;
1649    bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output;
1650 
1651    /* Don't use the binning pass variant when GS is present because we don't
1652    * support compiling correct binning pass variants with GS.
1653    */
1654    if (binning_pass && !gs) {
1655       vs = bs;
1656       tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
1657       stage++;
1658    }
1659 
1660    for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
1661       const struct ir3_shader_variant *xs = builder->variants[stage];
1662 
1663       if (stage == MESA_SHADER_FRAGMENT && binning_pass)
1664          fs = xs = NULL;
1665 
1666       tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
1667    }
1668 
1669    uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
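   /* When the VS does not write a per-view position, DISABLEMULTIPOS is set
    * and the single position is presumably reused for every view.
    */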
1670    uint32_t multiview_cntl = builder->multiview_mask ?
1671       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1672       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1673       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1674       : 0;
1675 
1676    /* Copy what the blob does here. This will emit an extra 0x3f
1677     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1678     * this is working around yet.
1679     */
1680    if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
1681       tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1682       tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1683       tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1684    } else {
1685       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1686    }
1687    tu_cs_emit(cs, multiview_cntl);
1688 
1689    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1690    tu_cs_emit(cs, multiview_cntl);
1691 
1692    if (multiview_cntl &&
1693        builder->device->physical_device->info->a6xx.supports_multiview_mask) {
1694       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1695       tu_cs_emit(cs, builder->multiview_mask);
1696    }
1697 
1698    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1699    tu_cs_emit(cs, 0);
1700 
1701    tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
1702    tu6_emit_vpc_varying_modes(cs, fs);
1703 
1704    bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT;
1705    uint32_t mrt_count = builder->color_attachment_count;
1706    uint32_t render_components = builder->render_components;
1707 
1708    if (builder->alpha_to_coverage) {
1709       /* alpha to coverage can behave like a discard */
1710       no_earlyz = true;
1711       /* alpha value comes from first mrt */
1712       render_components |= 0xf;
1713       if (!mrt_count) {
1714          mrt_count = 1;
1715          /* Disable memory write for dummy mrt because it doesn't get set otherwise */
1716          tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0));
1717       }
1718    }
1719 
1720    if (fs) {
1721       tu6_emit_fs_inputs(cs, fs);
1722       tu6_emit_fs_outputs(cs, fs, mrt_count,
1723                           builder->use_dual_src_blend,
1724                           render_components,
1725                           no_earlyz,
1726                           pipeline);
1727    } else {
1728       /* TODO: check if these can be skipped if fs is disabled */
1729       struct ir3_shader_variant dummy_variant = {};
1730       tu6_emit_fs_inputs(cs, &dummy_variant);
1731       tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count,
1732                           builder->use_dual_src_blend,
1733                           render_components,
1734                           no_earlyz,
1735                           NULL);
1736    }
1737 
1738    if (gs || hs) {
1739       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
1740    }
1741 }
1742 
1743 static void
1744 tu6_emit_vertex_input(struct tu_pipeline *pipeline,
1745                       struct tu_cs *cs,
1746                       const struct ir3_shader_variant *vs,
1747                       const VkPipelineVertexInputStateCreateInfo *info)
1748 {
1749    uint32_t vfd_decode_idx = 0;
1750    uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
1751    uint32_t step_rate[MAX_VBS];
1752 
1753    for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
1754       const VkVertexInputBindingDescription *binding =
1755          &info->pVertexBindingDescriptions[i];
1756 
1757       if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
1758          tu_cs_emit_regs(cs,
1759                         A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
1760       }
1761 
1762       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1763          binding_instanced |= 1 << binding->binding;
1764 
1765       step_rate[binding->binding] = 1;
1766    }
1767 
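   /* The instance step rate defaults to 1 and can be overridden per binding
    * via VK_EXT_vertex_attribute_divisor.
    */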
1768    const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
1769       vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1770    if (div_state) {
1771       for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
1772          const VkVertexInputBindingDivisorDescriptionEXT *desc =
1773             &div_state->pVertexBindingDivisors[i];
1774          step_rate[desc->binding] = desc->divisor;
1775       }
1776    }
1777 
1778    /* TODO: emit all VFD_DECODE/VFD_DEST_CNTL in same (two) pkt4 */
1779 
1780    for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
1781       const VkVertexInputAttributeDescription *attr =
1782          &info->pVertexAttributeDescriptions[i];
1783       uint32_t input_idx;
1784 
1785       for (input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
1786          if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == attr->location)
1787             break;
1788       }
1789 
1790       /* attribute not used, skip it */
1791       if (input_idx == vs->inputs_count)
1792          continue;
1793 
1794       const struct tu_native_format format = tu6_format_vtx(attr->format);
1795       tu_cs_emit_regs(cs,
1796                       A6XX_VFD_DECODE_INSTR(vfd_decode_idx,
1797                         .idx = attr->binding,
1798                         .offset = attr->offset,
1799                         .instanced = binding_instanced & (1 << attr->binding),
1800                         .format = format.fmt,
1801                         .swap = format.swap,
1802                         .unk30 = 1,
1803                         ._float = !vk_format_is_int(attr->format)),
1804                       A6XX_VFD_DECODE_STEP_RATE(vfd_decode_idx, step_rate[attr->binding]));
1805 
1806       tu_cs_emit_regs(cs,
1807                       A6XX_VFD_DEST_CNTL_INSTR(vfd_decode_idx,
1808                         .writemask = vs->inputs[input_idx].compmask,
1809                         .regid = vs->inputs[input_idx].regid));
1810 
1811       vfd_decode_idx++;
1812    }
1813 
1814    tu_cs_emit_regs(cs,
1815                    A6XX_VFD_CONTROL_0(
1816                      .fetch_cnt = vfd_decode_idx, /* decode_cnt for binning pass ? */
1817                      .decode_cnt = vfd_decode_idx));
1818 }
1819 
1820 void
1821 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport)
1822 {
1823    VkExtent2D guardband = {511, 511};
1824 
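   /* GRAS_CL_VPORT_* takes the viewport transform as offset/scale pairs
    * (window = offset + scale * ndc); the guardband is then shrunk to the
    * tightest extent across all viewports.
    */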
1825    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
1826    for (uint32_t i = 0; i < num_viewport; i++) {
1827       const VkViewport *viewport = &viewports[i];
1828       float offsets[3];
1829       float scales[3];
1830       scales[0] = viewport->width / 2.0f;
1831       scales[1] = viewport->height / 2.0f;
1832       scales[2] = viewport->maxDepth - viewport->minDepth;
1833       offsets[0] = viewport->x + scales[0];
1834       offsets[1] = viewport->y + scales[1];
1835       offsets[2] = viewport->minDepth;
1836       for (uint32_t j = 0; j < 3; j++) {
1837          tu_cs_emit(cs, fui(offsets[j]));
1838          tu_cs_emit(cs, fui(scales[j]));
1839       }
1840 
1841       guardband.width =
1842          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
1843       guardband.height =
1844          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
1845    }
1846 
1847    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
1848    for (uint32_t i = 0; i < num_viewport; i++) {
1849       const VkViewport *viewport = &viewports[i];
1850       VkOffset2D min;
1851       VkOffset2D max;
1852       min.x = (int32_t) viewport->x;
1853       max.x = (int32_t) ceilf(viewport->x + viewport->width);
1854       if (viewport->height >= 0.0f) {
1855          min.y = (int32_t) viewport->y;
1856          max.y = (int32_t) ceilf(viewport->y + viewport->height);
1857       } else {
1858          min.y = (int32_t)(viewport->y + viewport->height);
1859          max.y = (int32_t) ceilf(viewport->y);
1860       }
1861       /* the spec allows viewport->height to be 0.0f */
1862       if (min.y == max.y)
1863          max.y++;
1864       /* allow viewport->width = 0.0f for uninitialized viewports: */
1865       if (min.x == max.x)
1866          max.x++;
1867 
1868       min.x = MAX2(min.x, 0);
1869       min.y = MAX2(min.y, 0);
1870 
1871       assert(min.x < max.x);
1872       assert(min.y < max.y);
1873       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
1874                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
1875       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(max.x - 1) |
1876                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(max.y - 1));
1877    }
1878 
1879    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
1880    for (uint32_t i = 0; i < num_viewport; i++) {
1881       const VkViewport *viewport = &viewports[i];
1882       tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
1883       tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
1884    }
1885    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
1886    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
1887                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
1888 
1889    /* TODO: what to do about this and multi viewport ? */
1890    float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
1891    float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
1892 
1893    tu_cs_emit_regs(cs,
1894                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
1895                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
1896 }
1897 
1898 void
1899 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
1900 {
1901    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
1902 
1903    for (uint32_t i = 0; i < scissor_count; i++) {
1904       const VkRect2D *scissor = &scissors[i];
1905 
1906       uint32_t min_x = scissor->offset.x;
1907       uint32_t min_y = scissor->offset.y;
1908       uint32_t max_x = min_x + scissor->extent.width - 1;
1909       uint32_t max_y = min_y + scissor->extent.height - 1;
1910 
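      /* An empty scissor cannot be expressed directly, so program an
       * inverted (min > max) rectangle that rejects everything.
       */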
1911       if (!scissor->extent.width || !scissor->extent.height) {
1912          min_x = min_y = 1;
1913          max_x = max_y = 0;
1914       } else {
1915          /* avoid overflow */
1916          uint32_t scissor_max = BITFIELD_MASK(15);
1917          min_x = MIN2(scissor_max, min_x);
1918          min_y = MIN2(scissor_max, min_y);
1919          max_x = MIN2(scissor_max, max_x);
1920          max_y = MIN2(scissor_max, max_y);
1921       }
1922 
1923       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
1924                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
1925       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
1926                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
1927    }
1928 }
1929 
1930 void
1931 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
1932 {
1933    if (!samp_loc) {
1934       tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
1935       tu_cs_emit(cs, 0);
1936 
1937       tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
1938       tu_cs_emit(cs, 0);
1939 
1940       tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
1941       tu_cs_emit(cs, 0);
1942       return;
1943    }
1944 
1945    assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
1946    assert(samp_loc->sampleLocationGridSize.width == 1);
1947    assert(samp_loc->sampleLocationGridSize.height == 1);
1948 
1949    uint32_t sample_config =
1950       A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
1951    uint32_t sample_locations = 0;
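   /* Each sample's X/Y location is packed into one byte of the 32-bit
    * register, so sample i lands at bit offset i * 8.
    */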
1952    for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
1953       sample_locations |=
1954          (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
1955           A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
1956    }
1957 
1958    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
1959    tu_cs_emit(cs, sample_config);
1960    tu_cs_emit(cs, sample_locations);
1961 
1962    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
1963    tu_cs_emit(cs, sample_config);
1964    tu_cs_emit(cs, sample_locations);
1965 
1966    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
1967    tu_cs_emit(cs, sample_config);
1968    tu_cs_emit(cs, sample_locations);
1969 }
1970 
1971 static uint32_t
1972 tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
1973                  enum a5xx_line_mode line_mode,
1974                  bool multiview)
1975 {
1976    uint32_t gras_su_cntl = 0;
1977 
1978    if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT)
1979       gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
1980    if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT)
1981       gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
1982 
1983    if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
1984       gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
1985 
1986    gras_su_cntl |=
1987       A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
1988 
1989    if (rast_info->depthBiasEnable)
1990       gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
1991 
1992    gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode);
1993 
1994    if (multiview) {
1995       gras_su_cntl |=
1996          A6XX_GRAS_SU_CNTL_UNK17 |
1997          A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
1998    }
1999 
2000    return gras_su_cntl;
2001 }
2002 
2003 void
2004 tu6_emit_depth_bias(struct tu_cs *cs,
2005                     float constant_factor,
2006                     float clamp,
2007                     float slope_factor)
2008 {
2009    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2010    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
2011    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
2012    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
2013 }
2014 
2015 static uint32_t
2016 tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
2017                          bool has_alpha)
2018 {
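   /* For attachment formats without an alpha channel, destination alpha is
    * effectively 1.0, so blend factors referencing DST_ALPHA are rewritten
    * by tu_blend_factor_no_dst_alpha() to constant equivalents.
    */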
2019    const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
2020    const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
2021       has_alpha ? att->srcColorBlendFactor
2022                 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
2023    const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
2024       has_alpha ? att->dstColorBlendFactor
2025                 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
2026    const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
2027    const enum adreno_rb_blend_factor src_alpha_factor =
2028       tu6_blend_factor(att->srcAlphaBlendFactor);
2029    const enum adreno_rb_blend_factor dst_alpha_factor =
2030       tu6_blend_factor(att->dstAlphaBlendFactor);
2031 
2032    return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
2033           A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
2034           A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
2035           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
2036           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
2037           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
2038 }
2039 
2040 static uint32_t
2041 tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
2042                    uint32_t rb_mrt_control_rop,
2043                    bool has_alpha)
2044 {
2045    uint32_t rb_mrt_control =
2046       A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
2047 
2048    rb_mrt_control |= rb_mrt_control_rop;
2049 
2050    if (att->blendEnable) {
2051       rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;
2052 
2053       if (has_alpha)
2054          rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
2055    }
2056 
2057    return rb_mrt_control;
2058 }
2059 
2060 static void
2061 tu6_emit_rb_mrt_controls(struct tu_cs *cs,
2062                          const VkPipelineColorBlendStateCreateInfo *blend_info,
2063                          const VkFormat attachment_formats[MAX_RTS],
2064                          uint32_t *blend_enable_mask)
2065 {
2066    *blend_enable_mask = 0;
2067 
2068    bool rop_reads_dst = false;
2069    uint32_t rb_mrt_control_rop = 0;
2070    if (blend_info->logicOpEnable) {
2071       rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp);
2072       rb_mrt_control_rop =
2073          A6XX_RB_MRT_CONTROL_ROP_ENABLE |
2074          A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
2075    }
2076 
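   /* blend_enable_mask marks attachments that read the destination, either
    * because blending is enabled or because the logic op consumes dst.
    */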
2077    for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
2078       const VkPipelineColorBlendAttachmentState *att =
2079          &blend_info->pAttachments[i];
2080       const VkFormat format = attachment_formats[i];
2081 
2082       uint32_t rb_mrt_control = 0;
2083       uint32_t rb_mrt_blend_control = 0;
2084       if (format != VK_FORMAT_UNDEFINED) {
2085          const bool has_alpha = vk_format_has_alpha(format);
2086 
2087          rb_mrt_control =
2088             tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
2089          rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
2090 
2091          if (att->blendEnable || rop_reads_dst)
2092             *blend_enable_mask |= 1 << i;
2093       }
2094 
2095       tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
2096       tu_cs_emit(cs, rb_mrt_control);
2097       tu_cs_emit(cs, rb_mrt_blend_control);
2098    }
2099 }
2100 
2101 static void
2102 tu6_emit_blend_control(struct tu_cs *cs,
2103                        uint32_t blend_enable_mask,
2104                        bool dual_src_blend,
2105                        const VkPipelineMultisampleStateCreateInfo *msaa_info)
2106 {
2107    const uint32_t sample_mask =
2108       msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
2109                              : ((1 << msaa_info->rasterizationSamples) - 1);
2110 
2111    tu_cs_emit_regs(cs,
2112                    A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
2113                                       .dual_color_in_enable = dual_src_blend,
2114                                       .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2115                                       .unk8 = true));
2116 
2117    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
2118    tu_cs_emit_regs(cs,
2119                    A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
2120                                       .independent_blend = true,
2121                                       .sample_mask = sample_mask,
2122                                       .dual_color_in_enable = dual_src_blend,
2123                                       .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2124                                       .alpha_to_one = msaa_info->alphaToOneEnable));
2125 }
2126 
2127 static uint32_t
2128 calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config,
2129                  uint32_t pvtmem_bytes)
2130 {
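   /* Private memory is sized per fiber (512-byte aligned), then per SP
    * (4 KB aligned across all fibers on an SP); the total allocation covers
    * every SP core.
    */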
2131    uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512);
2132    uint32_t per_sp_size =
2133       ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12);
2134 
2135    if (config) {
2136       config->per_fiber_size = per_fiber_size;
2137       config->per_sp_size = per_sp_size;
2138    }
2139 
2140    return dev->physical_device->info->num_sp_cores * per_sp_size;
2141 }
2142 
2143 static VkResult
2144 tu_setup_pvtmem(struct tu_device *dev,
2145                 struct tu_pipeline *pipeline,
2146                 struct tu_pvtmem_config *config,
2147                 uint32_t pvtmem_bytes, bool per_wave)
2148 {
2149    if (!pvtmem_bytes) {
2150       memset(config, 0, sizeof(*config));
2151       return VK_SUCCESS;
2152    }
2153 
2154    uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes);
2155    config->per_wave = per_wave;
2156 
2157    VkResult result =
2158       tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size,
2159                      TU_BO_ALLOC_NO_FLAGS);
2160    if (result != VK_SUCCESS)
2161       return result;
2162 
2163    config->iova = pipeline->pvtmem_bo.iova;
2164 
2165    return result;
2166 }
2167 
2168 
2169 static VkResult
2170 tu_pipeline_allocate_cs(struct tu_device *dev,
2171                         struct tu_pipeline *pipeline,
2172                         struct tu_pipeline_builder *builder,
2173                         struct ir3_shader_variant *compute)
2174 {
2175    uint32_t size = 2048 + tu6_load_state_size(pipeline, compute);
2176 
2177    /* graphics case: */
2178    if (builder) {
2179       uint32_t pvtmem_bytes = 0;
2180       for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
2181          if (builder->variants[i]) {
2182             size += builder->variants[i]->info.size / 4;
2183             pvtmem_bytes = MAX2(pvtmem_bytes, builder->variants[i]->pvtmem_size);
2184          }
2185       }
2186 
2187       size += builder->binning_variant->info.size / 4;
2188       pvtmem_bytes = MAX2(pvtmem_bytes, builder->binning_variant->pvtmem_size);
2189 
2190       size += calc_pvtmem_size(dev, NULL, pvtmem_bytes) / 4;
2191 
2192       builder->additional_cs_reserve_size = 0;
2193       for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) {
2194          struct ir3_shader_variant *variant = builder->variants[i];
2195          if (variant) {
2196             builder->additional_cs_reserve_size +=
2197                tu_xs_get_additional_cs_size_dwords(variant);
2198 
2199             if (variant->binning) {
2200                builder->additional_cs_reserve_size +=
2201                   tu_xs_get_additional_cs_size_dwords(variant->binning);
2202             }
2203          }
2204       }
2205 
2206       size += builder->additional_cs_reserve_size;
2207    } else {
2208       size += compute->info.size / 4;
2209       size += calc_pvtmem_size(dev, NULL, compute->pvtmem_size) / 4;
2210 
2211       size += tu_xs_get_additional_cs_size_dwords(compute);
2212    }
2213 
2214    tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
2215 
2216    /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
2217     * that LOAD_STATE can potentially take up a large amount of space so we
2218     * calculate its size explicitly.
2219     */
2220    return tu_cs_reserve_space(&pipeline->cs, size);
2221 }
2222 
2223 static void
2224 tu_pipeline_shader_key_init(struct ir3_shader_key *key,
2225                             const struct tu_pipeline *pipeline,
2226                             const VkGraphicsPipelineCreateInfo *pipeline_info)
2227 {
2228    for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
2229       if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
2230          key->has_gs = true;
2231          break;
2232       }
2233    }
2234 
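   /* With static rasterizer discard there is no fragment work, so the
    * msaa/sample-shading related keys below can be skipped.
    */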
2235    if (pipeline_info->pRasterizationState->rasterizerDiscardEnable &&
2236        !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD)))
2237       return;
2238 
2239    const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
2240    const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
2241       vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
2242    if (msaa_info->rasterizationSamples > 1 ||
2243        /* also set msaa key when sample location is not the default
2244         * since this affects varying interpolation */
2245        (sample_locations && sample_locations->sampleLocationsEnable)) {
2246       key->msaa = true;
2247    }
2248 
2249    /* note: not actually used by ir3, just checked in tu6_emit_fs_inputs */
2250    if (msaa_info->sampleShadingEnable)
2251       key->sample_shading = true;
2252 
2253    /* We set this after we compile to NIR because we need the prim mode */
2254    key->tessellation = IR3_TESS_NONE;
2255 }
2256 
2257 static uint32_t
2258 tu6_get_tessmode(struct tu_shader* shader)
2259 {
2260    uint32_t primitive_mode = shader->ir3_shader->nir->info.tess.primitive_mode;
2261    switch (primitive_mode) {
2262    case GL_ISOLINES:
2263       return IR3_TESS_ISOLINES;
2264    case GL_TRIANGLES:
2265       return IR3_TESS_TRIANGLES;
2266    case GL_QUADS:
2267       return IR3_TESS_QUADS;
2268    case GL_NONE:
2269       return IR3_TESS_NONE;
2270    default:
2271       unreachable("bad tessmode");
2272    }
2273 }
2274 
2275 static uint64_t
2276 tu_upload_variant(struct tu_pipeline *pipeline,
2277                   const struct ir3_shader_variant *variant)
2278 {
2279    struct tu_cs_memory memory;
2280 
2281    if (!variant)
2282       return 0;
2283 
2284    /* This relies on the allocation being sufficiently aligned: shaders are
2285     * allocated first and the total size is always aligned correctly.
2286     * Note: an assert in tu6_emit_xs_config validates the alignment.
2287     */
2288    tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
2289 
2290    memcpy(memory.map, variant->bin, variant->info.size);
2291    return memory.iova;
2292 }
2293 
2294 static void
2295 tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant,
2296                      char *nir_from_spirv)
2297 {
2298    ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.nir);
2299    ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.disasm);
2300 
2301    struct tu_pipeline_executable exe = {
2302       .stage = variant->shader->type,
2303       .nir_from_spirv = nir_from_spirv,
2304       .nir_final = variant->disasm_info.nir,
2305       .disasm = variant->disasm_info.disasm,
2306       .stats = variant->info,
2307       .is_binning = variant->binning_pass,
2308    };
2309 
2310    util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
2311 }
2312 
2313 static VkResult
2314 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
2315                                     struct tu_pipeline *pipeline)
2316 {
2317    const struct ir3_compiler *compiler = builder->device->compiler;
2318    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
2319       NULL
2320    };
2321    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2322       gl_shader_stage stage =
2323          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2324       stage_infos[stage] = &builder->create_info->pStages[i];
2325    }
2326 
2327    struct ir3_shader_key key = {};
2328    tu_pipeline_shader_key_init(&key, pipeline, builder->create_info);
2329 
2330    nir_shader *nir[ARRAY_SIZE(builder->shaders)] = { NULL };
2331 
2332    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2333         stage < ARRAY_SIZE(nir); stage++) {
2334       const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2335       if (!stage_info)
2336          continue;
2337 
2338       nir[stage] = tu_spirv_to_nir(builder->device, stage_info, stage);
2339       if (!nir[stage])
2340          return VK_ERROR_OUT_OF_HOST_MEMORY;
2341    }
2342 
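   /* A fragment shader is optional in Vulkan; substitute an empty "noop" FS
    * so the rest of the pipeline setup can assume one is present.
    */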
2343    if (!nir[MESA_SHADER_FRAGMENT]) {
2344          const nir_shader_compiler_options *nir_options =
2345             ir3_get_compiler_options(builder->device->compiler);
2346          nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
2347                                                            nir_options,
2348                                                            "noop_fs");
2349          nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
2350    }
2351 
2352    const bool executable_info = builder->create_info->flags &
2353       VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
2354 
2355    char *nir_initial_disasm[ARRAY_SIZE(builder->shaders)] = { NULL };
2356 
2357    if (executable_info) {
2358       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2359             stage < ARRAY_SIZE(nir); stage++) {
2360          if (!nir[stage])
2361             continue;
2362 
2363          nir_initial_disasm[stage] =
2364             nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
2365       }
2366    }
2367 
2368    /* TODO do intra-stage linking here */
2369 
2370    uint32_t desc_sets = 0;
2371    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2372         stage < ARRAY_SIZE(nir); stage++) {
2373       if (!nir[stage])
2374          continue;
2375 
2376       struct tu_shader *shader =
2377          tu_shader_create(builder->device, nir[stage],
2378                           builder->multiview_mask, builder->layout,
2379                           builder->alloc);
2380       if (!shader)
2381          return VK_ERROR_OUT_OF_HOST_MEMORY;
2382 
2383       /* In SPIR-V generated from GLSL, the primitive mode is specified in the
2384        * tessellation evaluation shader, but in SPIR-V generated from HLSL,
2385        * the mode is specified in the tessellation control shader. */
2386       if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
2387           key.tessellation == IR3_TESS_NONE) {
2388          key.tessellation = tu6_get_tessmode(shader);
2389       }
2390 
2391       if (stage > MESA_SHADER_TESS_CTRL) {
2392          if (stage == MESA_SHADER_FRAGMENT) {
2393             key.tcs_store_primid = key.tcs_store_primid ||
2394                (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2395          } else {
2396             key.tcs_store_primid = key.tcs_store_primid ||
2397                BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2398          }
2399       }
2400 
2401       /* Keep track of which descriptor sets each shader actually uses;
2402        * this is determined in tu_lower_io. */
2403       desc_sets |= shader->active_desc_sets;
2404 
2405       builder->shaders[stage] = shader;
2406    }
2407    pipeline->active_desc_sets = desc_sets;
2408 
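   /* The last geometry stage determines whether Layer/ViewportIndex are
    * written; if not, the layer_zero/view_zero keys let the compiler assume
    * they are zero.
    */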
2409    struct tu_shader *last_shader = builder->shaders[MESA_SHADER_GEOMETRY];
2410    if (!last_shader)
2411       last_shader = builder->shaders[MESA_SHADER_TESS_EVAL];
2412    if (!last_shader)
2413       last_shader = builder->shaders[MESA_SHADER_VERTEX];
2414 
2415    uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;
2416 
2417    key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
2418    key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);
2419 
2420    pipeline->tess.patch_type = key.tessellation;
2421 
2422    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2423         stage < ARRAY_SIZE(builder->shaders); stage++) {
2424       if (!builder->shaders[stage])
2425          continue;
2426 
2427       bool created;
2428       builder->variants[stage] =
2429          ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
2430                                 &key, false, executable_info, &created);
2431       if (!builder->variants[stage])
2432          return VK_ERROR_OUT_OF_HOST_MEMORY;
2433    }
2434 
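   /* The combined const file usage of all stages can exceed the shared
    * limit; ir3_trim_constlen returns a bitmask of stages that must be
    * recompiled with safe_constlen set.
    */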
2435    uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler);
2436 
2437    key.safe_constlen = true;
2438 
2439    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2440         stage < ARRAY_SIZE(builder->shaders); stage++) {
2441       if (!builder->shaders[stage])
2442          continue;
2443 
2444       if (safe_constlens & (1 << stage)) {
2445          bool created;
2446          builder->variants[stage] =
2447             ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
2448                                    &key, false, executable_info, &created);
2449          if (!builder->variants[stage])
2450             return VK_ERROR_OUT_OF_HOST_MEMORY;
2451       }
2452    }
2453 
2454    const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
2455    struct ir3_shader_variant *variant;
2456 
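   /* Stream output has to run during the binning pass as well, so when it is
    * used (or when no binning variant is possible) the full VS is reused
    * instead of a stripped binning variant.
    */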
2457    if (vs->ir3_shader->stream_output.num_outputs ||
2458        !ir3_has_binning_vs(&key)) {
2459       variant = builder->variants[MESA_SHADER_VERTEX];
2460    } else {
2461       bool created;
2462       key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX));
2463       variant = ir3_shader_get_variant(vs->ir3_shader, &key,
2464                                        true, executable_info, &created);
2465       if (!variant)
2466          return VK_ERROR_OUT_OF_HOST_MEMORY;
2467    }
2468 
2469    builder->binning_variant = variant;
2470 
2471    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2472          stage < ARRAY_SIZE(nir); stage++) {
2473       if (builder->variants[stage]) {
2474          tu_append_executable(pipeline, builder->variants[stage],
2475             nir_initial_disasm[stage]);
2476       }
2477    }
2478 
2479    if (builder->binning_variant != builder->variants[MESA_SHADER_VERTEX]) {
2480       tu_append_executable(pipeline, builder->binning_variant, NULL);
2481    }
2482 
2483    return VK_SUCCESS;
2484 }
2485 
2486 static void
2487 tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
2488                                   struct tu_pipeline *pipeline)
2489 {
2490    const VkPipelineDynamicStateCreateInfo *dynamic_info =
2491       builder->create_info->pDynamicState;
2492 
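   /* Each *_mask below tracks which bits of the corresponding register stay
    * baked into the pipeline; dynamic states clear their bits so the command
    * buffer can supply them at draw time.
    */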
2493    pipeline->gras_su_cntl_mask = ~0u;
2494    pipeline->rb_depth_cntl_mask = ~0u;
2495    pipeline->rb_stencil_cntl_mask = ~0u;
2496    pipeline->pc_raster_cntl_mask = ~0u;
2497    pipeline->vpc_unknown_9107_mask = ~0u;
2498 
2499    if (!dynamic_info)
2500       return;
2501 
2502    for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
2503       VkDynamicState state = dynamic_info->pDynamicStates[i];
2504       switch (state) {
2505       case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
2506          if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
2507             pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2508          pipeline->dynamic_state_mask |= BIT(state);
2509          break;
2510       case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
2511          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
2512          break;
2513       case VK_DYNAMIC_STATE_CULL_MODE_EXT:
2514          pipeline->gras_su_cntl_mask &=
2515             ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
2516          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2517          break;
2518       case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
2519          pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
2520          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2521          break;
2522       case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
2523          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
2524          break;
2525       case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
2526          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
2527          break;
2528       case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
2529          pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
2530          break;
2531       case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
2532          pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
2533          break;
2534       case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
2535          pipeline->rb_depth_cntl_mask &=
2536             ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
2537          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2538          break;
2539       case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
2540          pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2541          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2542          break;
2543       case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
2544          pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
2545          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2546          break;
2547       case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
2548          pipeline->rb_depth_cntl_mask &=
2549             ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
2550          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2551          break;
2552       case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
2553          pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2554                                              A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2555                                              A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
2556          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
2557          break;
2558       case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
2559          pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
2560                                              A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
2561                                              A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
2562                                              A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
2563                                              A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
2564                                              A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
2565                                              A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
2566                                              A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
2567          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
2568          break;
2569       case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
2570          pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
2571          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2572          break;
2573       case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
2574          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
2575          break;
2576       case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
2577          pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
2578          pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
2579          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
2580          break;
2581       default:
2582          assert(!"unsupported dynamic state");
2583          break;
2584       }
2585    }
2586 }
2587 
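/* Record the per-stage constant layout (const state, constlen and push
 * constant range) that the command buffer later needs when emitting user
 * constants, so it doesn't have to keep the ir3 variant around.
 */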
2588 static void
2589 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2590                         struct tu_shader *shader,
2591                         struct ir3_shader_variant *v)
2592 {
2593    link->const_state = *ir3_const_state(v);
2594    link->constlen = v->constlen;
2595    link->push_consts = shader->push_consts;
2596 }
2597 
2598 static void
2599 tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
2600                                         struct tu_pipeline *pipeline)
2601 {
2602    struct tu_cs prog_cs;
2603 
2604    /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2605     * else that could depend on that state (like push constants)
2606     *
2607     * Note also that this always uses the full VS even in binning pass.  The
2608     * binning pass variant has the same const layout as the full VS, and
2609     * the constlen for the VS will be the same or greater than the constlen
2610     * for the binning pass variant.  It is required that the constlen state
2611     * matches between binning and draw passes, as some parts of the push
2612     * consts are emitted in state groups that are shared between the binning
2613     * and draw passes.
2614     */
2615    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
2616    tu6_emit_program_config(&prog_cs, builder);
2617    pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2618 
2619    tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
2620    tu6_emit_program(&prog_cs, builder, false, pipeline);
2621    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2622 
2623    tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
2624    tu6_emit_program(&prog_cs, builder, true, pipeline);
2625    pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2626 
2627    VkShaderStageFlags stages = 0;
2628    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
2629       stages |= builder->create_info->pStages[i].stage;
2630    }
2631    pipeline->active_stages = stages;
2632 
2633    for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
2634       if (!builder->shaders[i])
2635          continue;
2636 
2637       tu_pipeline_set_linkage(&pipeline->program.link[i],
2638                               builder->shaders[i],
2639                               builder->variants[i]);
2640    }
2641 }
2642 
2643 static void
2644 tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
2645                                        struct tu_pipeline *pipeline)
2646 {
2647    const VkPipelineVertexInputStateCreateInfo *vi_info =
2648       builder->create_info->pVertexInputState;
2649    const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
2650    const struct ir3_shader_variant *bs = builder->binning_variant;
2651 
2652    /* Bindings may contain holes */
2653    for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
2654       pipeline->num_vbs =
2655          MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
2656    }
2657 
2658    struct tu_cs vi_cs;
2659    tu_cs_begin_sub_stream(&pipeline->cs,
2660                           MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2661    tu6_emit_vertex_input(pipeline, &vi_cs, vs, vi_info);
2662    pipeline->vi.state = tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2663 
2664    if (bs) {
2665       tu_cs_begin_sub_stream(&pipeline->cs,
2666                              MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2667       tu6_emit_vertex_input(pipeline, &vi_cs, bs, vi_info);
2668       pipeline->vi.binning_state =
2669          tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2670    }
2671 }
2672 
2673 static void
2674 tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
2675                                          struct tu_pipeline *pipeline)
2676 {
2677    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
2678       builder->create_info->pInputAssemblyState;
2679 
2680    pipeline->ia.primtype = tu6_primtype(ia_info->topology);
2681    pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
2682 }
2683 
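/* Helper for state that can be either baked into the pipeline or set
 * dynamically: if the state identified by `id` is not dynamic, begin a draw
 * state of `size` dwords in `cs` for the caller to fill and return true;
 * otherwise return false so the caller skips emitting it and the command
 * buffer provides it at draw time instead.
 */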
2684 static bool
2685 tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
2686                          uint32_t id, uint32_t size)
2687 {
2688    assert(id < ARRAY_SIZE(pipeline->dynamic_state));
2689 
2690    if (pipeline->dynamic_state_mask & BIT(id))
2691       return false;
2692 
2693    pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
2694    return true;
2695 }
2696 
2697 static void
2698 tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
2699                                        struct tu_pipeline *pipeline)
2700 {
2701    if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
2702        !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
2703       return;
2704 
2705    const VkPipelineTessellationStateCreateInfo *tess_info =
2706       builder->create_info->pTessellationState;
2707 
2708    assert(pipeline->ia.primtype == DI_PT_PATCHES0);
2709    assert(tess_info->patchControlPoints <= 32);
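   /* Patch primitive types are numbered consecutively starting at
    * DI_PT_PATCHES0, so adding the control point count selects the primtype
    * for that patch size.
    */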
2710    pipeline->ia.primtype += tess_info->patchControlPoints;
2711    const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
2712          vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
2713    pipeline->tess.upper_left_domain_origin = !domain_info ||
2714          domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
2715    const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
2716    const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
2717    pipeline->tess.param_stride = hs->output_size * 4;
2718    pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
2719    pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
2720 }
2721 
2722 static void
2723 tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
2724                                    struct tu_pipeline *pipeline)
2725 {
2726    /* The spec says:
2727     *
2728     *    pViewportState is a pointer to an instance of the
2729     *    VkPipelineViewportStateCreateInfo structure, and is ignored if the
2730     *    pipeline has rasterization disabled.
2731     *
2732     * We leave the relevant registers stale in that case.
2733     */
2734    if (builder->rasterizer_discard)
2735       return;
2736 
2737    const VkPipelineViewportStateCreateInfo *vp_info =
2738       builder->create_info->pViewportState;
2739 
2740    struct tu_cs cs;
2741 
2742    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
2743       tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount);
2744 
2745    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
2746       tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
2747 }
2748 
2749 static void
2750 tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
2751                                         struct tu_pipeline *pipeline)
2752 {
2753    const VkPipelineRasterizationStateCreateInfo *rast_info =
2754       builder->create_info->pRasterizationState;
2755 
2756    enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);
2757 
2758    bool depth_clip_disable = rast_info->depthClampEnable;
2759 
2760    const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
2761       vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
2762    if (depth_clip_state)
2763       depth_clip_disable = !depth_clip_state->depthClipEnable;
2764 
2765    pipeline->line_mode = RECTANGULAR;
2766 
2767    if (tu6_primtype_line(pipeline->ia.primtype)) {
2768       const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
2769          vk_find_struct_const(rast_info->pNext,
2770                               PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
2771 
2772       if (rast_line_state && rast_line_state->lineRasterizationMode ==
2773                VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
2774          pipeline->line_mode = BRESENHAM;
2775       }
2776    }
2777 
2778    struct tu_cs cs;
2779    uint32_t cs_size = 9 +
2780       (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) +
2781       (builder->emit_msaa_state ? 11 : 0);
2782    pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);
2783 
2784    tu_cs_emit_regs(&cs,
2785                    A6XX_GRAS_CL_CNTL(
2786                      .znear_clip_disable = depth_clip_disable,
2787                      .zfar_clip_disable = depth_clip_disable,
2788                      /* TODO should this be depth_clip_disable instead? */
2789                      .unk5 = rast_info->depthClampEnable,
2790                      .zero_gb_scale_z = 1,
2791                      .vp_clip_code_ignore = 1));
2792 
2793    tu_cs_emit_regs(&cs,
2794                    A6XX_VPC_POLYGON_MODE(mode));
2795 
2796    tu_cs_emit_regs(&cs,
2797                    A6XX_PC_POLYGON_MODE(mode));
2798 
2799    /* move to hw ctx init? */
2800    tu_cs_emit_regs(&cs,
2801                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
2802                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
2803 
2804    if (builder->device->physical_device->info->a6xx.has_shading_rate) {
2805       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
2806       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
2807       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
2808       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
2809    }
2810 
2811    /* If the sample count couldn't be determined from the subpass, emit it here.
2812     * This happens when the subpass doesn't use any color/depth attachments.
2813     */
2814    if (builder->emit_msaa_state)
2815       tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);
2816 
2817    const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
2818       vk_find_struct_const(rast_info->pNext,
2819                            PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
2820    unsigned stream = stream_info ? stream_info->rasterizationStream : 0;
2821 
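   /* PC_RASTER_CNTL/VPC_UNKNOWN_9107 carry both the rasterized stream and the
    * rasterizer-discard bits.  Keep the values on the pipeline struct as well,
    * since the command buffer has to re-emit them when rasterizer discard is
    * dynamic.
    */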
2822    pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
2823    pipeline->vpc_unknown_9107 = 0;
2824    if (rast_info->rasterizerDiscardEnable) {
2825       pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
2826       pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
2827    }
2828 
2829    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
2830       tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
2831       tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
2832    }
2833 
2834    pipeline->gras_su_cntl =
2835       tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);
2836 
2837    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
2838       tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));
2839 
2840    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
2841       tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
2842                           rast_info->depthBiasClamp,
2843                           rast_info->depthBiasSlopeFactor);
2844    }
2845 
2846    const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
2847       vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
2848    pipeline->provoking_vertex_last = provoking_vtx_state &&
2849       provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
2850 }
2851 
2852 static void
2853 tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
2854                                         struct tu_pipeline *pipeline)
2855 {
2856    /* The spec says:
2857     *
2858     *    pDepthStencilState is a pointer to an instance of the
2859     *    VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
2860     *    the pipeline has rasterization disabled or if the subpass of the
2861     *    render pass the pipeline is created against does not use a
2862     *    depth/stencil attachment.
2863     */
2864    const VkPipelineDepthStencilStateCreateInfo *ds_info =
2865       builder->create_info->pDepthStencilState;
2866    const VkPipelineRasterizationStateCreateInfo *rast_info =
2867       builder->create_info->pRasterizationState;
2868    uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
2869    struct tu_cs cs;
2870 
2871    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
2872        builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
2873       if (ds_info->depthTestEnable) {
2874          rb_depth_cntl |=
2875             A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
2876             A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
2877             A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */
2878 
2879          if (rast_info->depthClampEnable)
2880             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE;
2881 
2882          if (ds_info->depthWriteEnable)
2883             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2884       }
2885 
2886       if (ds_info->depthBoundsTestEnable)
2887          rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
2888 
2889       if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
2890          tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
2891    } else {
2892       /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
2893        * to 0 when this pipeline is used, as enabling depth test when there
2894        * is no depth attachment is a problem (at least for the S8_UINT case)
2895        */
2896       if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
2897          pipeline->rb_depth_cntl_disable = true;
2898    }
2899 
2900    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
2901       const VkStencilOpState *front = &ds_info->front;
2902       const VkStencilOpState *back = &ds_info->back;
2903 
2904       rb_stencil_cntl |=
2905          A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
2906          A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
2907          A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
2908          A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
2909          A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
2910          A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
2911          A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
2912          A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
2913 
2914       if (ds_info->stencilTestEnable) {
2915          rb_stencil_cntl |=
2916             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2917             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2918             A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
2919       }
2920    }
2921 
2922    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
2923       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
2924       tu_cs_emit(&cs, rb_depth_cntl);
2925    }
2926    pipeline->rb_depth_cntl = rb_depth_cntl;
2927 
2928    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
2929       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
2930       tu_cs_emit(&cs, rb_stencil_cntl);
2931    }
2932    pipeline->rb_stencil_cntl = rb_stencil_cntl;
2933 
2934    /* the remaining draw states aren't used if there is no d/s, leave them empty */
2935    if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
2936       return;
2937 
2938    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
2939       tu_cs_emit_regs(&cs,
2940                       A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
2941                       A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
2942    }
2943 
2944    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
2945       tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
2946                                                .bfmask = ds_info->back.compareMask & 0xff));
2947    }
2948 
2949    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
2950       update_stencil_mask(&pipeline->stencil_wrmask,  VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
2951       update_stencil_mask(&pipeline->stencil_wrmask,  VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
2952       tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask));
2953    }
2954 
2955    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
2956       tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
2957                                               .bfref = ds_info->back.reference & 0xff));
2958    }
2959 
2960    if (builder->shaders[MESA_SHADER_FRAGMENT]) {
2961       const struct ir3_shader_variant *fs = &builder->shaders[MESA_SHADER_FRAGMENT]->ir3_shader->variants[0];
2962       if (fs->has_kill || fs->no_earlyz || fs->writes_pos) {
2963          pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
2964       }
2965       if (fs->no_earlyz || fs->writes_pos) {
2966          pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
2967       }
2968    }
2969 }
2970 
2971 static void
2972 tu_pipeline_builder_parse_multisample_and_color_blend(
2973    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
2974 {
2975    /* The spec says:
2976     *
2977     *    pMultisampleState is a pointer to an instance of the
2978     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
2979     *    has rasterization disabled.
2980     *
2981     * Also,
2982     *
2983     *    pColorBlendState is a pointer to an instance of the
2984     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
2985     *    pipeline has rasterization disabled or if the subpass of the render
2986     *    pass the pipeline is created against does not use any color
2987     *    attachments.
2988     *
2989     * We leave the relevant registers stale when rasterization is disabled.
2990     */
2991    if (builder->rasterizer_discard)
2992       return;
2993 
2994    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
2995    const VkPipelineMultisampleStateCreateInfo *msaa_info =
2996       builder->create_info->pMultisampleState;
2997    const VkPipelineColorBlendStateCreateInfo *blend_info =
2998       builder->use_color_attachments ? builder->create_info->pColorBlendState
2999                                      : &dummy_blend_info;
3000 
3001    struct tu_cs cs;
3002    pipeline->blend_state =
3003       tu_cs_draw_state(&pipeline->cs, &cs, blend_info->attachmentCount * 3 + 4);
3004 
3005    uint32_t blend_enable_mask;
3006    tu6_emit_rb_mrt_controls(&cs, blend_info,
3007                             builder->color_attachment_formats,
3008                             &blend_enable_mask);
3009 
3010    tu6_emit_blend_control(&cs, blend_enable_mask,
3011                           builder->use_dual_src_blend, msaa_info);
3012 
3013    assert(cs.cur == cs.end); /* validate draw state size */
3014 
3015    if (blend_enable_mask) {
3016       for (int i = 0; i < blend_info->attachmentCount; i++) {
3017          VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
3018          /* Disable LRZ writes when blend is enabled, since the
3019           * resulting pixel value from the blend-draw
3020           * depends on an earlier draw, which LRZ in the draw pass
3021           * could early-reject if the previous blend-enabled draw wrote LRZ.
3022           *
3023           * From the PoV of LRZ, having masked color channels is
3024           * the same as having blend enabled, in that the draw will
3025           * care about the fragments from an earlier draw.
3026           *
3027           * TODO: We need to disable LRZ writes only for the binning pass.
3028           * Therefore, we need to emit it in a separate draw state. We keep
3029           * it disabled for sysmem path as well for the moment.
3030           */
3031          if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
3032             pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3033          }
3034       }
3035    }
3036 
3037    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
3038       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3039       tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
3040    }
3041 
3042    const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
3043       vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
3044    const VkSampleLocationsInfoEXT *samp_loc = NULL;
3045 
3046    if (sample_locations && sample_locations->sampleLocationsEnable)
3047       samp_loc = &sample_locations->sampleLocationsInfo;
3048 
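   /* With no app-provided sample locations (samp_loc == NULL) the smaller
    * draw state leaves programmable locations disabled, i.e. the default
    * hardware sample positions are used.
    */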
3049    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3050                                 samp_loc ? 9 : 6)) {
3051       tu6_emit_sample_locations(&cs, samp_loc);
3052    }
3053 }
3054 
3055 static void
3056 tu_pipeline_finish(struct tu_pipeline *pipeline,
3057                    struct tu_device *dev,
3058                    const VkAllocationCallbacks *alloc)
3059 {
3060    tu_cs_finish(&pipeline->cs);
3061 
3062    if (pipeline->pvtmem_bo.size)
3063       tu_bo_finish(dev, &pipeline->pvtmem_bo);
3064 
3065    ralloc_free(pipeline->executables_mem_ctx);
3066 }
3067 
3068 static VkResult
3069 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3070                           struct tu_pipeline **pipeline)
3071 {
3072    VkResult result;
3073 
3074    *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
3075                                 sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
3076    if (!*pipeline)
3077       return VK_ERROR_OUT_OF_HOST_MEMORY;
3078 
3079    (*pipeline)->layout = builder->layout;
3080    (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3081    util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3082 
3083    /* compile and upload shaders */
3084    result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3085    if (result != VK_SUCCESS) {
3086       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3087       return result;
3088    }
3089 
3090    result = tu_pipeline_allocate_cs(builder->device, *pipeline, builder, NULL);
3091    if (result != VK_SUCCESS) {
3092       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3093       return result;
3094    }
3095 
3096    for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++)
3097       builder->shader_iova[i] = tu_upload_variant(*pipeline, builder->variants[i]);
3098 
3099    builder->binning_vs_iova =
3100       tu_upload_variant(*pipeline, builder->binning_variant);
3101 
3102    /* Setup private memory. Note that because we're sharing the same private
3103     * memory for all stages, all stages must use the same config, or else
3104     * fibers from one stage might overwrite fibers in another.
3105     */
3106 
3107    uint32_t pvtmem_size = 0;
3108    bool per_wave = true;
3109    for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
3110       if (builder->variants[i]) {
3111          pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size);
3112          if (!builder->variants[i]->pvtmem_per_wave)
3113             per_wave = false;
3114       }
3115    }
3116 
3117    if (builder->binning_variant) {
3118       pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
3119       if (!builder->binning_variant->pvtmem_per_wave)
3120          per_wave = false;
3121    }
3122 
3123    result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
3124                             pvtmem_size, per_wave);
3125    if (result != VK_SUCCESS) {
3126       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3127       return result;
3128    }
3129 
3130    tu_pipeline_builder_parse_dynamic(builder, *pipeline);
3131    tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
3132    tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
3133    tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
3134    tu_pipeline_builder_parse_tessellation(builder, *pipeline);
3135    tu_pipeline_builder_parse_viewport(builder, *pipeline);
3136    tu_pipeline_builder_parse_rasterization(builder, *pipeline);
3137    tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3138    tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3139    tu6_emit_load_state(*pipeline, false);
3140 
3141    /* we should have reserved enough space upfront such that the CS never
3142     * grows
3143     */
3144    assert((*pipeline)->cs.bo_count == 1);
3145 
3146    return VK_SUCCESS;
3147 }
3148 
3149 static void
3150 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3151 {
3152    for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
3153       if (!builder->shaders[i])
3154          continue;
3155       tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc);
3156    }
3157 }
3158 
3159 static void
3160 tu_pipeline_builder_init_graphics(
3161    struct tu_pipeline_builder *builder,
3162    struct tu_device *dev,
3163    struct tu_pipeline_cache *cache,
3164    const VkGraphicsPipelineCreateInfo *create_info,
3165    const VkAllocationCallbacks *alloc)
3166 {
3167    TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);
3168 
3169    *builder = (struct tu_pipeline_builder) {
3170       .device = dev,
3171       .cache = cache,
3172       .create_info = create_info,
3173       .alloc = alloc,
3174       .layout = layout,
3175    };
3176 
3177    bool rasterizer_discard_dynamic = false;
3178    if (create_info->pDynamicState) {
3179       for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
3180          if (create_info->pDynamicState->pDynamicStates[i] ==
3181                VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) {
3182             rasterizer_discard_dynamic = true;
3183             break;
3184          }
3185       }
3186    }
3187 
3188    const struct tu_render_pass *pass =
3189       tu_render_pass_from_handle(create_info->renderPass);
3190    const struct tu_subpass *subpass =
3191       &pass->subpasses[create_info->subpass];
3192 
3193    builder->multiview_mask = subpass->multiview_mask;
3194 
3195    builder->rasterizer_discard =
3196       builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
3197       !rasterizer_discard_dynamic;
3198 
3199    /* variableMultisampleRate support */
3200    builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;
3201 
3202    if (builder->rasterizer_discard) {
3203       builder->samples = VK_SAMPLE_COUNT_1_BIT;
3204    } else {
3205       builder->samples = create_info->pMultisampleState->rasterizationSamples;
3206       builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable;
3207 
3208       const uint32_t a = subpass->depth_stencil_attachment.attachment;
3209       builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
3210          pass->attachments[a].format : VK_FORMAT_UNDEFINED;
3211 
3212       assert(subpass->color_count == 0 ||
3213              !create_info->pColorBlendState ||
3214              subpass->color_count == create_info->pColorBlendState->attachmentCount);
3215       builder->color_attachment_count = subpass->color_count;
3216       for (uint32_t i = 0; i < subpass->color_count; i++) {
3217          const uint32_t a = subpass->color_attachments[i].attachment;
3218          if (a == VK_ATTACHMENT_UNUSED)
3219             continue;
3220 
3221          builder->color_attachment_formats[i] = pass->attachments[a].format;
3222          builder->use_color_attachments = true;
3223          builder->render_components |= 0xf << (i * 4);
3224       }
3225 
3226       if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
3227          builder->color_attachment_count++;
3228          builder->use_dual_src_blend = true;
3229          /* dual source blending has an extra fs output in the 2nd slot */
3230          if (subpass->color_attachments[0].attachment != VK_ATTACHMENT_UNUSED)
3231             builder->render_components |= 0xf << 4;
3232       }
3233    }
3234 }
3235 
3236 static VkResult
3237 tu_graphics_pipeline_create(VkDevice device,
3238                             VkPipelineCache pipelineCache,
3239                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
3240                             const VkAllocationCallbacks *pAllocator,
3241                             VkPipeline *pPipeline)
3242 {
3243    TU_FROM_HANDLE(tu_device, dev, device);
3244    TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
3245 
3246    struct tu_pipeline_builder builder;
3247    tu_pipeline_builder_init_graphics(&builder, dev, cache,
3248                                      pCreateInfo, pAllocator);
3249 
3250    struct tu_pipeline *pipeline = NULL;
3251    VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
3252    tu_pipeline_builder_finish(&builder);
3253 
3254    if (result == VK_SUCCESS)
3255       *pPipeline = tu_pipeline_to_handle(pipeline);
3256    else
3257       *pPipeline = VK_NULL_HANDLE;
3258 
3259    return result;
3260 }
3261 
3262 VKAPI_ATTR VkResult VKAPI_CALL
3263 tu_CreateGraphicsPipelines(VkDevice device,
3264                            VkPipelineCache pipelineCache,
3265                            uint32_t count,
3266                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
3267                            const VkAllocationCallbacks *pAllocator,
3268                            VkPipeline *pPipelines)
3269 {
3270    VkResult final_result = VK_SUCCESS;
3271 
3272    for (uint32_t i = 0; i < count; i++) {
3273       VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
3274                                                     &pCreateInfos[i], pAllocator,
3275                                                     &pPipelines[i]);
3276 
3277       if (result != VK_SUCCESS)
3278          final_result = result;
3279    }
3280 
3281    return final_result;
3282 }
3283 
3284 static VkResult
3285 tu_compute_pipeline_create(VkDevice device,
3286                            VkPipelineCache _cache,
3287                            const VkComputePipelineCreateInfo *pCreateInfo,
3288                            const VkAllocationCallbacks *pAllocator,
3289                            VkPipeline *pPipeline)
3290 {
3291    TU_FROM_HANDLE(tu_device, dev, device);
3292    TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
3293    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
3294    VkResult result;
3295 
3296    struct tu_pipeline *pipeline;
3297 
3298    *pPipeline = VK_NULL_HANDLE;
3299 
3300    pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
3301                                VK_OBJECT_TYPE_PIPELINE);
3302    if (!pipeline)
3303       return VK_ERROR_OUT_OF_HOST_MEMORY;
3304 
3305    pipeline->layout = layout;
3306 
3307    pipeline->executables_mem_ctx = ralloc_context(NULL);
3308    util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
3309 
3310    struct ir3_shader_key key = {};
3311 
3312    nir_shader *nir = tu_spirv_to_nir(dev, stage_info, MESA_SHADER_COMPUTE);
3313 
3314    const bool executable_info = pCreateInfo->flags &
3315       VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
3316 
3317    char *nir_initial_disasm = executable_info ?
3318       nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;
3319 
3320    struct tu_shader *shader =
3321       tu_shader_create(dev, nir, 0, layout, pAllocator);
3322    if (!shader) {
3323       result = VK_ERROR_OUT_OF_HOST_MEMORY;
3324       goto fail;
3325    }
3326 
3327    pipeline->active_desc_sets = shader->active_desc_sets;
3328 
3329    bool created;
3330    struct ir3_shader_variant *v =
3331       ir3_shader_get_variant(shader->ir3_shader, &key, false, executable_info, &created);
3332    if (!v) {
3333       result = VK_ERROR_OUT_OF_HOST_MEMORY;
3334       goto fail;
3335    }
3336 
3337    tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
3338                            shader, v);
3339 
3340    result = tu_pipeline_allocate_cs(dev, pipeline, NULL, v);
3341    if (result != VK_SUCCESS)
3342       goto fail;
3343 
3344    uint64_t shader_iova = tu_upload_variant(pipeline, v);
3345 
3346    struct tu_pvtmem_config pvtmem;
3347    tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);
3348 
3349    for (int i = 0; i < 3; i++)
3350       pipeline->compute.local_size[i] = v->local_size[i];
3351 
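   /* The base wave width on a6xx is 64 fibers; variants compiled with
    * double_threadsize run 128-wide waves, which is what we report as the
    * subgroup size.
    */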
3352    pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
3353 
3354    struct tu_cs prog_cs;
3355    uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
3356    tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
3357    tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
3358    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3359 
3360    tu6_emit_load_state(pipeline, true);
3361 
3362    tu_append_executable(pipeline, v, nir_initial_disasm);
3363 
3364    tu_shader_destroy(dev, shader, pAllocator);
3365 
3366    *pPipeline = tu_pipeline_to_handle(pipeline);
3367 
3368    return VK_SUCCESS;
3369 
3370 fail:
3371    if (shader)
3372       tu_shader_destroy(dev, shader, pAllocator);
3373 
3374    vk_object_free(&dev->vk, pAllocator, pipeline);
3375 
3376    return result;
3377 }
3378 
3379 VKAPI_ATTR VkResult VKAPI_CALL
3380 tu_CreateComputePipelines(VkDevice device,
3381                           VkPipelineCache pipelineCache,
3382                           uint32_t count,
3383                           const VkComputePipelineCreateInfo *pCreateInfos,
3384                           const VkAllocationCallbacks *pAllocator,
3385                           VkPipeline *pPipelines)
3386 {
3387    VkResult final_result = VK_SUCCESS;
3388 
3389    for (uint32_t i = 0; i < count; i++) {
3390       VkResult result = tu_compute_pipeline_create(device, pipelineCache,
3391                                                    &pCreateInfos[i],
3392                                                    pAllocator, &pPipelines[i]);
3393       if (result != VK_SUCCESS)
3394          final_result = result;
3395    }
3396 
3397    return final_result;
3398 }
3399 
3400 VKAPI_ATTR void VKAPI_CALL
3401 tu_DestroyPipeline(VkDevice _device,
3402                    VkPipeline _pipeline,
3403                    const VkAllocationCallbacks *pAllocator)
3404 {
3405    TU_FROM_HANDLE(tu_device, dev, _device);
3406    TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
3407 
3408    if (!_pipeline)
3409       return;
3410 
3411    tu_pipeline_finish(pipeline, dev, pAllocator);
3412    vk_object_free(&dev->vk, pAllocator, pipeline);
3413 }
3414 
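/* Fill a fixed-size string field of the executable property/statistic
 * structs: zero the whole field first and assert that the formatted text
 * (including the terminating NUL) actually fit.
 */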
3415 #define WRITE_STR(field, ...) ({                                \
3416    memset(field, 0, sizeof(field));                             \
3417    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
3418    assert(_i > 0 && _i < sizeof(field));                        \
3419 })
3420 
3421 static const struct tu_pipeline_executable *
3422 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
3423 {
3424    assert(index < util_dynarray_num_elements(&pipeline->executables,
3425                                              struct tu_pipeline_executable));
3426    return util_dynarray_element(
3427       &pipeline->executables, struct tu_pipeline_executable, index);
3428 }
3429 
3430 VKAPI_ATTR VkResult VKAPI_CALL
3431 tu_GetPipelineExecutablePropertiesKHR(
3432       VkDevice _device,
3433       const VkPipelineInfoKHR* pPipelineInfo,
3434       uint32_t* pExecutableCount,
3435       VkPipelineExecutablePropertiesKHR* pProperties)
3436 {
3437    TU_FROM_HANDLE(tu_device, dev, _device);
3438    TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
3439    VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount);
3440 
3441    util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
3442       vk_outarray_append(&out, props) {
3443          gl_shader_stage stage = exe->stage;
3444          props->stages = mesa_to_vk_shader_stage(stage);
3445 
3446          if (!exe->is_binning)
3447             WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
3448          else
3449             WRITE_STR(props->name, "Binning VS");
3450 
3451          WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
3452 
3453          props->subgroupSize =
3454             dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
3455       }
3456    }
3457 
3458    return vk_outarray_status(&out);
3459 }
3460 
3461 VKAPI_ATTR VkResult VKAPI_CALL
3462 tu_GetPipelineExecutableStatisticsKHR(
3463       VkDevice _device,
3464       const VkPipelineExecutableInfoKHR* pExecutableInfo,
3465       uint32_t* pStatisticCount,
3466       VkPipelineExecutableStatisticKHR* pStatistics)
3467 {
3468    TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
3469    VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount);
3470 
3471    const struct tu_pipeline_executable *exe =
3472       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3473 
3474    vk_outarray_append(&out, stat) {
3475       WRITE_STR(stat->name, "Max Waves Per Core");
3476       WRITE_STR(stat->description,
3477                 "Maximum number of simultaneous waves per core.");
3478       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3479       stat->value.u64 = exe->stats.max_waves;
3480    }
3481 
3482    vk_outarray_append(&out, stat) {
3483       WRITE_STR(stat->name, "Instruction Count");
3484       WRITE_STR(stat->description,
3485                 "Total number of IR3 instructions in the final generated "
3486                 "shader executable.");
3487       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3488       stat->value.u64 = exe->stats.instrs_count;
3489    }
3490 
3491    vk_outarray_append(&out, stat) {
3492       WRITE_STR(stat->name, "NOPs Count");
3493       WRITE_STR(stat->description,
3494                 "Number of NOP instructions in the final generated "
3495                 "shader executable.");
3496       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3497       stat->value.u64 = exe->stats.nops_count;
3498    }
3499 
3500    vk_outarray_append(&out, stat) {
3501       WRITE_STR(stat->name, "MOV Count");
3502       WRITE_STR(stat->description,
3503                 "Number of MOV instructions in the final generated "
3504                 "shader executable.");
3505       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3506       stat->value.u64 = exe->stats.mov_count;
3507    }
3508 
3509    vk_outarray_append(&out, stat) {
3510       WRITE_STR(stat->name, "COV Count");
3511       WRITE_STR(stat->description,
3512                 "Number of COV instructions in the final generated "
3513                 "shader executable.");
3514       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3515       stat->value.u64 = exe->stats.cov_count;
3516    }
3517 
3518    vk_outarray_append(&out, stat) {
3519       WRITE_STR(stat->name, "Registers used");
3520       WRITE_STR(stat->description,
3521                 "Number of registers used in the final generated "
3522                 "shader executable.");
3523       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3524       stat->value.u64 = exe->stats.max_reg + 1;
3525    }
3526 
3527    vk_outarray_append(&out, stat) {
3528       WRITE_STR(stat->name, "Half-registers used");
3529       WRITE_STR(stat->description,
3530                 "Number of half-registers used in the final generated "
3531                 "shader executable.");
3532       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3533       stat->value.u64 = exe->stats.max_half_reg + 1;
3534    }
3535 
3536    vk_outarray_append(&out, stat) {
3537       WRITE_STR(stat->name, "Instructions with SS sync bit");
3538       WRITE_STR(stat->description,
3539                 "SS bit is set for instructions which depend on a result "
3540                 "of \"long\" instructions to prevent RAW hazard.");
3541       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3542       stat->value.u64 = exe->stats.ss;
3543    }
3544 
3545    vk_outarray_append(&out, stat) {
3546       WRITE_STR(stat->name, "Instructions with SY sync bit");
3547       WRITE_STR(stat->description,
3548                 "SY bit is set for instructions which depend on a result "
3549                 "of loads from global memory to prevent RAW hazard.");
3550       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3551       stat->value.u64 = exe->stats.sy;
3552    }
3553 
3554    vk_outarray_append(&out, stat) {
3555       WRITE_STR(stat->name, "Estimated cycles stalled on SS");
3556       WRITE_STR(stat->description,
3557                 "A better metric to estimate the impact of SS syncs.");
3558       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3559       stat->value.u64 = exe->stats.sstall;
3560    }
3561 
3562    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
3563       vk_outarray_append(&out, stat) {
3564          WRITE_STR(stat->name, "cat%d instructions", i);
3565          WRITE_STR(stat->description,
3566                   "Number of cat%d instructions.", i);
3567          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3568          stat->value.u64 = exe->stats.instrs_per_cat[i];
3569       }
3570    }
3571 
3572    vk_outarray_append(&out, stat) {
3573       WRITE_STR(stat->name, "STP Count");
3574       WRITE_STR(stat->description,
3575                 "Number of STore Private instructions in the final generated "
3576                 "shader executable.");
3577       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3578       stat->value.u64 = exe->stats.stp_count;
3579    }
3580 
3581    vk_outarray_append(&out, stat) {
3582       WRITE_STR(stat->name, "LDP Count");
3583       WRITE_STR(stat->description,
3584                 "Number of LoaD Private instructions in the final generated "
3585                 "shader executable.");
3586       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3587       stat->value.u64 = exe->stats.ldp_count;
3588    }
3589 
3590    return vk_outarray_status(&out);
3591 }
3592 
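/* Copy an IR text dump into a VkPipelineExecutableInternalRepresentationKHR
 * following the usual Vulkan two-call convention: when pData is NULL only the
 * required dataSize is returned; otherwise at most dataSize bytes are copied
 * and false is returned if the text was truncated, which makes the caller
 * report VK_INCOMPLETE.
 */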
3593 static bool
3594 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
3595               const char *data)
3596 {
3597    ir->isText = VK_TRUE;
3598 
3599    size_t data_len = strlen(data) + 1;
3600 
3601    if (ir->pData == NULL) {
3602       ir->dataSize = data_len;
3603       return true;
3604    }
3605 
3606    strncpy(ir->pData, data, ir->dataSize);
3607    if (ir->dataSize < data_len)
3608       return false;
3609 
3610    ir->dataSize = data_len;
3611    return true;
3612 }
3613 
3614 VKAPI_ATTR VkResult VKAPI_CALL
3615 tu_GetPipelineExecutableInternalRepresentationsKHR(
3616     VkDevice _device,
3617     const VkPipelineExecutableInfoKHR* pExecutableInfo,
3618     uint32_t* pInternalRepresentationCount,
3619     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
3620 {
3621    TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
3622    VK_OUTARRAY_MAKE(out, pInternalRepresentations, pInternalRepresentationCount);
3623    bool incomplete_text = false;
3624 
3625    const struct tu_pipeline_executable *exe =
3626       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3627 
3628    if (exe->nir_from_spirv) {
3629       vk_outarray_append(&out, ir) {
3630          WRITE_STR(ir->name, "NIR from SPIRV");
3631          WRITE_STR(ir->description,
3632                    "Initial NIR before any optimizations");
3633 
3634          if (!write_ir_text(ir, exe->nir_from_spirv))
3635             incomplete_text = true;
3636       }
3637    }
3638 
3639    if (exe->nir_final) {
3640       vk_outarray_append(&out, ir) {
3641          WRITE_STR(ir->name, "Final NIR");
3642          WRITE_STR(ir->description,
3643                    "Final NIR before going into the back-end compiler");
3644 
3645          if (!write_ir_text(ir, exe->nir_final))
3646             incomplete_text = true;
3647       }
3648    }
3649 
3650    if (exe->disasm) {
3651       vk_outarray_append(&out, ir) {
3652          WRITE_STR(ir->name, "IR3 Assembly");
3653          WRITE_STR(ir->description,
3654                    "Final IR3 assembly for the generated shader binary");
3655 
3656          if (!write_ir_text(ir, exe->disasm))
3657             incomplete_text = true;
3658       }
3659    }
3660 
3661    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
3662 }
3663