1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "ac_nir.h"
26 #include "ac_rtld.h"
27 #include "nir.h"
28 #include "nir_serialize.h"
29 #include "si_pipe.h"
30 #include "si_shader_internal.h"
31 #include "sid.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_strings.h"
34 #include "util/u_memory.h"
35 #include "util/mesa-sha1.h"
36 
37 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
38 
39 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
40 
41 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
42 
43 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)44 bool si_is_multi_part_shader(struct si_shader *shader)
45 {
46    if (shader->selector->screen->info.chip_class <= GFX8 ||
47        shader->selector->info.stage > MESA_SHADER_GEOMETRY)
48       return false;
49 
50    return shader->key.ge.as_ls || shader->key.ge.as_es ||
51           shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||
52           shader->selector->info.stage == MESA_SHADER_GEOMETRY;
53 }
54 
55 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)56 bool si_is_merged_shader(struct si_shader *shader)
57 {
58    if (shader->selector->info.stage > MESA_SHADER_GEOMETRY)
59       return false;
60 
61    return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
62 }
63 
64 /**
65  * Returns a unique index for a per-patch semantic name and index. The index
66  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
67  * can be calculated.
68  */
si_shader_io_get_unique_index_patch(unsigned semantic)69 unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
70 {
71    switch (semantic) {
72    case VARYING_SLOT_TESS_LEVEL_OUTER:
73       return 0;
74    case VARYING_SLOT_TESS_LEVEL_INNER:
75       return 1;
76    default:
77       if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
78          return 2 + (semantic - VARYING_SLOT_PATCH0);
79 
80       assert(!"invalid semantic");
81       return 0;
82    }
83 }
84 
85 /**
86  * Returns a unique index for a semantic name and index. The index must be
87  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
88  * calculated.
89  */
si_shader_io_get_unique_index(unsigned semantic,bool is_varying)90 unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
91 {
92    switch (semantic) {
93    case VARYING_SLOT_POS:
94       return 0;
95    default:
96       /* Since some shader stages use the highest used IO index
97        * to determine the size to allocate for inputs/outputs
98        * (in LDS, tess and GS rings). GENERIC should be placed right
99        * after POSITION to make that size as small as possible.
100        */
101       if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
102          return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */
103 
104       /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
105        * legacy desktop GL varyings because they are mutually exclusive.
106        */
107       if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
108          return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */
109 
110       assert(!"invalid generic index");
111       return 0;
112 
113    /* Legacy desktop GL varyings. */
114    case VARYING_SLOT_FOGC:
115       return 33;
116    case VARYING_SLOT_COL0:
117       return 34;
118    case VARYING_SLOT_COL1:
119       return 35;
120    case VARYING_SLOT_BFC0:
121       /* If it's a varying, COLOR and BCOLOR alias. */
122       if (is_varying)
123          return 34;
124       else
125          return 36;
126    case VARYING_SLOT_BFC1:
127       if (is_varying)
128          return 35;
129       else
130          return 37;
131    case VARYING_SLOT_TEX0:
132    case VARYING_SLOT_TEX1:
133    case VARYING_SLOT_TEX2:
134    case VARYING_SLOT_TEX3:
135    case VARYING_SLOT_TEX4:
136    case VARYING_SLOT_TEX5:
137    case VARYING_SLOT_TEX6:
138    case VARYING_SLOT_TEX7:
139       return 38 + (semantic - VARYING_SLOT_TEX0);
140    case VARYING_SLOT_CLIP_VERTEX:
141       return 46;
142 
143    /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
144    case VARYING_SLOT_CLIP_DIST0:
145       return 49;
146    case VARYING_SLOT_CLIP_DIST1:
147       return 50;
148    case VARYING_SLOT_PSIZ:
149       return 51;
150 
151    /* These can't be written by LS, HS, and ES. */
152    case VARYING_SLOT_LAYER:
153       return 52;
154    case VARYING_SLOT_VIEWPORT:
155       return 53;
156    case VARYING_SLOT_PRIMITIVE_ID:
157       return 54;
158    }
159 }
160 
si_dump_streamout(struct pipe_stream_output_info * so)161 static void si_dump_streamout(struct pipe_stream_output_info *so)
162 {
163    unsigned i;
164 
165    if (so->num_outputs) {
166       fprintf(stderr, "STREAMOUT\n");
167 
168       fprintf(stderr, "  STRIDES: {");
169       for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++)
170          fprintf(stderr, "%u%s", so->stride[i], i < 3 ? ", " : "");
171       fprintf(stderr, "}\n");
172    }
173 
174    for (i = 0; i < so->num_outputs; i++) {
175       unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
176       fprintf(stderr, "  %i: STREAM%u: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
177               i, so->output[i].stream, so->output[i].output_buffer,
178               so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
179               so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
180               mask & 4 ? "z" : "", mask & 8 ? "w" : "");
181    }
182 }
183 
declare_streamout_params(struct si_shader_context * ctx,struct pipe_stream_output_info * so)184 static void declare_streamout_params(struct si_shader_context *ctx,
185                                      struct pipe_stream_output_info *so)
186 {
187    if (ctx->screen->use_ngg_streamout) {
188       if (ctx->stage == MESA_SHADER_TESS_EVAL)
189          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
190       return;
191    }
192 
193    /* Streamout SGPRs. */
194    if (so->num_outputs) {
195       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config);
196       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index);
197    } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
198       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
199    }
200 
201    /* A streamout buffer offset is loaded if the stride is non-zero. */
202    for (int i = 0; i < 4; i++) {
203       if (!so->stride[i])
204          continue;
205 
206       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]);
207    }
208 }
209 
si_get_max_workgroup_size(const struct si_shader * shader)210 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
211 {
212    switch (shader->selector->info.stage) {
213    case MESA_SHADER_VERTEX:
214    case MESA_SHADER_TESS_EVAL:
215       return shader->key.ge.as_ngg ? 128 : 0;
216 
217    case MESA_SHADER_TESS_CTRL:
218       /* Return this so that LLVM doesn't remove s_barrier
219        * instructions on chips where we use s_barrier. */
220       return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
221 
222    case MESA_SHADER_GEOMETRY:
223       return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
224 
225    case MESA_SHADER_COMPUTE:
226       break; /* see below */
227 
228    default:
229       return 0;
230    }
231 
232    /* Compile a variable block size using the maximum variable size. */
233    if (shader->selector->info.base.workgroup_size_variable)
234       return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
235 
236    uint16_t *local_size = shader->selector->info.base.workgroup_size;
237    unsigned max_work_group_size = (uint32_t)local_size[0] *
238                                   (uint32_t)local_size[1] *
239                                   (uint32_t)local_size[2];
240    assert(max_work_group_size);
241    return max_work_group_size;
242 }
243 
declare_const_and_shader_buffers(struct si_shader_context * ctx,bool assign_params)244 static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
245 {
246    enum ac_arg_type const_shader_buf_type;
247 
248    if (ctx->shader->selector->info.base.num_ubos == 1 &&
249        ctx->shader->selector->info.base.num_ssbos == 0)
250       const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
251    else
252       const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
253 
254    ac_add_arg(
255       &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
256       assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
257 }
258 
declare_samplers_and_images(struct si_shader_context * ctx,bool assign_params)259 static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
260 {
261    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
262               assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
263 }
264 
declare_per_stage_desc_pointers(struct si_shader_context * ctx,bool assign_params)265 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
266 {
267    declare_const_and_shader_buffers(ctx, assign_params);
268    declare_samplers_and_images(ctx, assign_params);
269 }
270 
declare_global_desc_pointers(struct si_shader_context * ctx)271 static void declare_global_desc_pointers(struct si_shader_context *ctx)
272 {
273    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings);
274    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
275               &ctx->bindless_samplers_and_images);
276 }
277 
declare_vb_descriptor_input_sgprs(struct si_shader_context * ctx)278 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
279 {
280    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers);
281 
282    unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
283    if (num_vbos_in_user_sgprs) {
284       unsigned user_sgprs = ctx->args.num_sgprs_used;
285 
286       if (si_is_merged_shader(ctx->shader))
287          user_sgprs -= 8;
288       assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
289 
290       /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
291       for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
292          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
293 
294       assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
295       for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
296          ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
297    }
298 }
299 
declare_vs_input_vgprs(struct si_shader_context * ctx,unsigned * num_prolog_vgprs)300 static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
301 {
302    struct si_shader *shader = ctx->shader;
303 
304    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
305    if (shader->key.ge.as_ls) {
306       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);
307       if (ctx->screen->info.chip_class >= GFX10) {
308          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
309          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
310       } else {
311          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
312          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
313       }
314    } else if (ctx->screen->info.chip_class >= GFX10) {
315       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
316       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
317                  &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */
318       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
319    } else {
320       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
321       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id);
322       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
323    }
324 
325    if (!shader->is_gs_copy_shader) {
326       /* Vertex load indices. */
327       if (shader->selector->info.num_inputs) {
328          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
329          for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
330             ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
331       }
332       *num_prolog_vgprs += shader->selector->info.num_inputs;
333    }
334 }
335 
declare_vs_blit_inputs(struct si_shader_context * ctx,unsigned vs_blit_property)336 static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
337 {
338    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
339    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                 /* i16 x1, y1 */
340    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);               /* depth */
341 
342    if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
343       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
344       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
345       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
346       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
347    } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
348       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
349       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
350       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
351       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
352       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
353       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
354    }
355 }
356 
declare_tes_input_vgprs(struct si_shader_context * ctx)357 static void declare_tes_input_vgprs(struct si_shader_context *ctx)
358 {
359    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u);
360    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v);
361    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id);
362    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
363 }
364 
365 enum
366 {
367    /* Convenient merged shader definitions. */
368    SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
369    SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
370 };
371 
si_add_arg_checked(struct ac_shader_args * args,enum ac_arg_regfile file,unsigned registers,enum ac_arg_type type,struct ac_arg * arg,unsigned idx)372 void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
373                         enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
374 {
375    assert(args->arg_count == idx);
376    ac_add_arg(args, file, registers, type, arg);
377 }
378 
si_init_shader_args(struct si_shader_context * ctx,bool ngg_cull_shader)379 void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
380 {
381    struct si_shader *shader = ctx->shader;
382    unsigned i, num_returns, num_return_sgprs;
383    unsigned num_prolog_vgprs = 0;
384    unsigned stage = ctx->stage;
385 
386    memset(&ctx->args, 0, sizeof(ctx->args));
387 
388    /* Set MERGED shaders. */
389    if (ctx->screen->info.chip_class >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
390       if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
391          stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
392       else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
393          stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
394    }
395 
396    switch (stage) {
397    case MESA_SHADER_VERTEX:
398       declare_global_desc_pointers(ctx);
399 
400       if (shader->selector->info.base.vs.blit_sgprs_amd) {
401          declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
402 
403          /* VGPRs */
404          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
405          break;
406       }
407 
408       declare_per_stage_desc_pointers(ctx, true);
409       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
410 
411       if (ctx->shader->is_gs_copy_shader) {
412          declare_streamout_params(ctx, &shader->selector->so);
413          /* VGPRs */
414          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
415          break;
416       }
417 
418       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
419       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
420       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
421       declare_vb_descriptor_input_sgprs(ctx);
422 
423       if (shader->key.ge.as_es) {
424          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
425       } else if (shader->key.ge.as_ls) {
426          /* no extra parameters */
427       } else {
428          declare_streamout_params(ctx, &shader->selector->so);
429       }
430 
431       /* VGPRs */
432       declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
433       break;
434 
435    case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
436       declare_global_desc_pointers(ctx);
437       declare_per_stage_desc_pointers(ctx, true);
438       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
439       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
440       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
441       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
442       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
443       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
444 
445       /* VGPRs */
446       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
447       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
448 
449       /* param_tcs_offchip_offset and param_tcs_factor_offset are
450        * placed after the user SGPRs.
451        */
452       for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
453          ac_add_return(&ctx->args, AC_ARG_SGPR);
454       for (i = 0; i < 11; i++)
455          ac_add_return(&ctx->args, AC_ARG_VGPR);
456       break;
457 
458    case SI_SHADER_MERGED_VERTEX_TESSCTRL:
459       /* Merged stages have 8 system SGPRs at the beginning. */
460       /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
461       declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
462       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
463       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
464       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
465       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
466       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
467       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
468 
469       declare_global_desc_pointers(ctx);
470       declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
471 
472       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
473       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
474       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
475       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
476       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
477       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
478       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
479       if (ctx->stage == MESA_SHADER_VERTEX)
480          declare_vb_descriptor_input_sgprs(ctx);
481 
482       /* VGPRs (first TCS, then VS) */
483       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
484       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
485 
486       if (ctx->stage == MESA_SHADER_VERTEX) {
487          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
488 
489          /* LS return values are inputs to the TCS main shader part. */
490          for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
491             ac_add_return(&ctx->args, AC_ARG_SGPR);
492          for (i = 0; i < 2; i++)
493             ac_add_return(&ctx->args, AC_ARG_VGPR);
494 
495          /* VS outputs passed via VGPRs to TCS. */
496          if (shader->key.ge.opt.same_patch_vertices) {
497             unsigned num_outputs = util_last_bit64(shader->selector->outputs_written);
498             for (i = 0; i < num_outputs * 4; i++)
499                ac_add_return(&ctx->args, AC_ARG_VGPR);
500          }
501       } else {
502          /* TCS inputs are passed via VGPRs from VS. */
503          if (shader->key.ge.opt.same_patch_vertices) {
504             unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written);
505             for (i = 0; i < num_inputs * 4; i++)
506                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
507          }
508 
509          /* TCS return values are inputs to the TCS epilog.
510           *
511           * param_tcs_offchip_offset, param_tcs_factor_offset,
512           * param_tcs_offchip_layout, and internal_bindings
513           * should be passed to the epilog.
514           */
515          for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
516             ac_add_return(&ctx->args, AC_ARG_SGPR);
517          for (i = 0; i < 11; i++)
518             ac_add_return(&ctx->args, AC_ARG_VGPR);
519       }
520       break;
521 
522    case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
523       /* Merged stages have 8 system SGPRs at the beginning. */
524       /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
525       declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
526 
527       if (ctx->shader->key.ge.as_ngg)
528          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info);
529       else
530          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
531 
532       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
533       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
534       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
535       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
536       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
537 
538       declare_global_desc_pointers(ctx);
539       if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
540          declare_per_stage_desc_pointers(
541             ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
542       }
543 
544       if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
545          declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
546       } else {
547          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
548 
549          if (ctx->stage == MESA_SHADER_VERTEX) {
550             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
551             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
552             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
553          } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
554             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
555             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
556             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
557          } else {
558             /* GS */
559             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
560             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
561             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
562          }
563 
564          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info);
565 
566          if (ctx->stage == MESA_SHADER_VERTEX)
567             declare_vb_descriptor_input_sgprs(ctx);
568       }
569 
570       /* VGPRs (first GS, then VS/TES) */
571       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
572       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
573       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
574       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
575       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
576 
577       if (ctx->stage == MESA_SHADER_VERTEX) {
578          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
579       } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
580          declare_tes_input_vgprs(ctx);
581       }
582 
583       if ((ctx->shader->key.ge.as_es || ngg_cull_shader) &&
584           (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
585          unsigned num_user_sgprs, num_vgprs;
586 
587          if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
588             /* For the NGG cull shader, add 1 SGPR to hold
589              * the vertex buffer pointer.
590              */
591             num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1;
592 
593             if (shader->selector->num_vbos_in_user_sgprs) {
594                assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
595                num_user_sgprs =
596                   SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
597             }
598          } else {
599             num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
600          }
601 
602          /* The NGG cull shader has to return all 9 VGPRs.
603           *
604           * The normal merged ESGS shader only has to return the 5 VGPRs
605           * for the GS stage.
606           */
607          num_vgprs = ngg_cull_shader ? 9 : 5;
608 
609          /* ES return values are inputs to GS. */
610          for (i = 0; i < 8 + num_user_sgprs; i++)
611             ac_add_return(&ctx->args, AC_ARG_SGPR);
612          for (i = 0; i < num_vgprs; i++)
613             ac_add_return(&ctx->args, AC_ARG_VGPR);
614       }
615       break;
616 
617    case MESA_SHADER_TESS_EVAL:
618       declare_global_desc_pointers(ctx);
619       declare_per_stage_desc_pointers(ctx, true);
620       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
621       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
622       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
623 
624       if (shader->key.ge.as_es) {
625          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
626          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
627          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
628       } else {
629          declare_streamout_params(ctx, &shader->selector->so);
630          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
631       }
632 
633       /* VGPRs */
634       declare_tes_input_vgprs(ctx);
635       break;
636 
637    case MESA_SHADER_GEOMETRY:
638       declare_global_desc_pointers(ctx);
639       declare_per_stage_desc_pointers(ctx, true);
640       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
641       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id);
642 
643       /* VGPRs */
644       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
645       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
646       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
647       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
648       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]);
649       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]);
650       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]);
651       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
652       break;
653 
654    case MESA_SHADER_FRAGMENT:
655       declare_global_desc_pointers(ctx);
656       declare_per_stage_desc_pointers(ctx, true);
657       si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
658       si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
659                          SI_PARAM_PRIM_MASK);
660 
661       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
662                          SI_PARAM_PERSP_SAMPLE);
663       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
664                          SI_PARAM_PERSP_CENTER);
665       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
666                          SI_PARAM_PERSP_CENTROID);
667       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
668       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
669                          SI_PARAM_LINEAR_SAMPLE);
670       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
671                          SI_PARAM_LINEAR_CENTER);
672       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
673                          SI_PARAM_LINEAR_CENTROID);
674       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
675       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
676                          SI_PARAM_POS_X_FLOAT);
677       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
678                          SI_PARAM_POS_Y_FLOAT);
679       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
680                          SI_PARAM_POS_Z_FLOAT);
681       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
682                          SI_PARAM_POS_W_FLOAT);
683       shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
684       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
685                          SI_PARAM_FRONT_FACE);
686       shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
687       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
688                          SI_PARAM_ANCILLARY);
689       shader->info.sample_coverage_vgpr_index = ctx->args.num_vgprs_used;
690       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
691                          SI_PARAM_SAMPLE_COVERAGE);
692       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
693                          SI_PARAM_POS_FIXED_PT);
694 
695       /* Color inputs from the prolog. */
696       if (shader->selector->info.colors_read) {
697          unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
698 
699          for (i = 0; i < num_color_elements; i++)
700             ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
701 
702          num_prolog_vgprs += num_color_elements;
703       }
704 
705       /* Outputs for the epilog. */
706       num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
707       num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
708                     shader->selector->info.writes_z + shader->selector->info.writes_stencil +
709                     shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
710 
711       for (i = 0; i < num_return_sgprs; i++)
712          ac_add_return(&ctx->args, AC_ARG_SGPR);
713       for (; i < num_returns; i++)
714          ac_add_return(&ctx->args, AC_ARG_VGPR);
715       break;
716 
717    case MESA_SHADER_COMPUTE:
718       declare_global_desc_pointers(ctx);
719       declare_per_stage_desc_pointers(ctx, true);
720       if (shader->selector->info.uses_grid_size)
721          ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
722       if (shader->selector->info.uses_variable_block_size)
723          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);
724 
725       unsigned cs_user_data_dwords =
726          shader->selector->info.base.cs.user_data_components_amd;
727       if (cs_user_data_dwords) {
728          ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
729       }
730 
731       /* Some descriptors can be in user SGPRs. */
732       /* Shader buffers in user SGPRs. */
733       for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
734          while (ctx->args.num_sgprs_used % 4 != 0)
735             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
736 
737          ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
738       }
739       /* Images in user SGPRs. */
740       for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
741          unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8;
742 
743          while (ctx->args.num_sgprs_used % num_sgprs != 0)
744             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
745 
746          ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
747       }
748 
749       /* Hardware SGPRs. */
750       for (i = 0; i < 3; i++) {
751          if (shader->selector->info.uses_block_id[i]) {
752             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
753          }
754       }
755       if (shader->selector->info.uses_subgroup_info)
756          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
757 
758       /* Hardware VGPRs. */
759       if (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN)
760          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids);
761       else
762          ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
763       break;
764    default:
765       assert(0 && "unimplemented shader");
766       return;
767    }
768 
769    shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
770    shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
771 
772    assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
773    shader->info.num_input_vgprs -= num_prolog_vgprs;
774 }
775 
776 /* For the UMR disassembler. */
777 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
778 #define DEBUGGER_NUM_MARKERS        5
779 
si_shader_binary_open(struct si_screen * screen,struct si_shader * shader,struct ac_rtld_binary * rtld)780 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
781                                   struct ac_rtld_binary *rtld)
782 {
783    const struct si_shader_selector *sel = shader->selector;
784    const char *part_elfs[5];
785    size_t part_sizes[5];
786    unsigned num_parts = 0;
787 
788 #define add_part(shader_or_part)                                                                   \
789    if (shader_or_part) {                                                                           \
790       part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer;                                  \
791       part_sizes[num_parts] = (shader_or_part)->binary.elf_size;                                   \
792       num_parts++;                                                                                 \
793    }
794 
795    add_part(shader->prolog);
796    add_part(shader->previous_stage);
797    add_part(shader);
798    add_part(shader->epilog);
799 
800 #undef add_part
801 
802    struct ac_rtld_symbol lds_symbols[2];
803    unsigned num_lds_symbols = 0;
804 
805    if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
806        (sel->info.stage == MESA_SHADER_GEOMETRY ||
807         (sel->info.stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
808       struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
809       sym->name = "esgs_ring";
810       sym->size = shader->gs_info.esgs_ring_size * 4;
811       sym->align = 64 * 1024;
812    }
813 
814    if (sel->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
815       struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
816       sym->name = "ngg_emit";
817       sym->size = shader->ngg.ngg_emit_size * 4;
818       sym->align = 4;
819    }
820 
821    bool ok = ac_rtld_open(
822       rtld, (struct ac_rtld_open_info){.info = &screen->info,
823                                        .options =
824                                           {
825                                              .halt_at_entry = screen->options.halt_shaders,
826                                           },
827                                        .shader_type = sel->info.stage,
828                                        .wave_size = shader->wave_size,
829                                        .num_parts = num_parts,
830                                        .elf_ptrs = part_elfs,
831                                        .elf_sizes = part_sizes,
832                                        .num_shared_lds_symbols = num_lds_symbols,
833                                        .shared_lds_symbols = lds_symbols});
834 
835    if (rtld->lds_size > 0) {
836       unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
837       shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
838    }
839 
840    return ok;
841 }
842 
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)843 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
844 {
845    struct ac_rtld_binary rtld;
846    si_shader_binary_open(screen, shader, &rtld);
847    uint64_t size = rtld.exec_size;
848    ac_rtld_close(&rtld);
849    return size;
850 }
851 
si_get_external_symbol(void * data,const char * name,uint64_t * value)852 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
853 {
854    uint64_t *scratch_va = data;
855 
856    if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
857       *value = (uint32_t)*scratch_va;
858       return true;
859    }
860    if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
861       /* Enable scratch coalescing. */
862       *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
863       return true;
864    }
865 
866    return false;
867 }
868 
si_shader_binary_upload(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va)869 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
870                              uint64_t scratch_va)
871 {
872    struct ac_rtld_binary binary;
873    if (!si_shader_binary_open(sscreen, shader, &binary))
874       return false;
875 
876    si_resource_reference(&shader->bo, NULL);
877    shader->bo = si_aligned_buffer_create(
878       &sscreen->b,
879       (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
880       SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
881       PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
882    if (!shader->bo)
883       return false;
884 
885    /* Upload. */
886    struct ac_rtld_upload_info u = {};
887    u.binary = &binary;
888    u.get_external_symbol = si_get_external_symbol;
889    u.cb_data = &scratch_va;
890    u.rx_va = shader->bo->gpu_address;
891    u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws,
892       shader->bo->buf, NULL,
893       PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
894    if (!u.rx_ptr)
895       return false;
896 
897    int size = ac_rtld_upload(&u);
898 
899    if (sscreen->debug_flags & DBG(SQTT)) {
900       /* Remember the uploaded code */
901       shader->binary.uploaded_code_size = size;
902       shader->binary.uploaded_code = malloc(size);
903       memcpy(shader->binary.uploaded_code, u.rx_ptr, size);
904    }
905 
906    sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
907    ac_rtld_close(&binary);
908 
909    return size >= 0;
910 }
911 
si_shader_dump_disassembly(struct si_screen * screen,const struct si_shader_binary * binary,gl_shader_stage stage,unsigned wave_size,struct pipe_debug_callback * debug,const char * name,FILE * file)912 static void si_shader_dump_disassembly(struct si_screen *screen,
913                                        const struct si_shader_binary *binary,
914                                        gl_shader_stage stage, unsigned wave_size,
915                                        struct pipe_debug_callback *debug, const char *name,
916                                        FILE *file)
917 {
918    struct ac_rtld_binary rtld_binary;
919 
920    if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
921                                       .info = &screen->info,
922                                       .shader_type = stage,
923                                       .wave_size = wave_size,
924                                       .num_parts = 1,
925                                       .elf_ptrs = &binary->elf_buffer,
926                                       .elf_sizes = &binary->elf_size}))
927       return;
928 
929    const char *disasm;
930    size_t nbytes;
931 
932    if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
933       goto out;
934 
935    if (nbytes > INT_MAX)
936       goto out;
937 
938    if (debug && debug->debug_message) {
939       /* Very long debug messages are cut off, so send the
940        * disassembly one line at a time. This causes more
941        * overhead, but on the plus side it simplifies
942        * parsing of resulting logs.
943        */
944       pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
945 
946       uint64_t line = 0;
947       while (line < nbytes) {
948          int count = nbytes - line;
949          const char *nl = memchr(disasm + line, '\n', nbytes - line);
950          if (nl)
951             count = nl - (disasm + line);
952 
953          if (count) {
954             pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
955          }
956 
957          line += count + 1;
958       }
959 
960       pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
961    }
962 
963    if (file) {
964       fprintf(file, "Shader %s disassembly:\n", name);
965       fprintf(file, "%*s", (int)nbytes, disasm);
966    }
967 
968 out:
969    ac_rtld_close(&rtld_binary);
970 }
971 
si_calculate_max_simd_waves(struct si_shader * shader)972 static void si_calculate_max_simd_waves(struct si_shader *shader)
973 {
974    struct si_screen *sscreen = shader->selector->screen;
975    struct ac_shader_config *conf = &shader->config;
976    unsigned num_inputs = shader->selector->info.num_inputs;
977    unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
978    unsigned lds_per_wave = 0;
979    unsigned max_simd_waves;
980 
981    max_simd_waves = sscreen->info.max_wave64_per_simd;
982 
983    /* Compute LDS usage for PS. */
984    switch (shader->selector->info.stage) {
985    case MESA_SHADER_FRAGMENT:
986       /* The minimum usage per wave is (num_inputs * 48). The maximum
987        * usage is (num_inputs * 48 * 16).
988        * We can get anything in between and it varies between waves.
989        *
990        * The 48 bytes per input for a single primitive is equal to
991        * 4 bytes/component * 4 components/input * 3 points.
992        *
993        * Other stages don't know the size at compile time or don't
994        * allocate LDS per wave, but instead they do it per thread group.
995        */
996       lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
997       break;
998    case MESA_SHADER_COMPUTE: {
999          unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
1000          lds_per_wave = (conf->lds_size * lds_increment) /
1001                         DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
1002       }
1003       break;
1004    default:;
1005    }
1006 
1007    /* Compute the per-SIMD wave counts. */
1008    if (conf->num_sgprs) {
1009       max_simd_waves =
1010          MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
1011    }
1012 
1013    if (conf->num_vgprs) {
1014       /* Always print wave limits as Wave64, so that we can compare
1015        * Wave32 and Wave64 with shader-db fairly. */
1016       unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
1017       max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
1018    }
1019 
1020    unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
1021    if (lds_per_wave)
1022       max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1023 
1024    shader->info.max_simd_waves = max_simd_waves;
1025 }
1026 
si_shader_dump_stats_for_shader_db(struct si_screen * screen,struct si_shader * shader,struct pipe_debug_callback * debug)1027 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1028                                         struct pipe_debug_callback *debug)
1029 {
1030    const struct ac_shader_config *conf = &shader->config;
1031    static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
1032 
1033    if (screen->options.debug_disassembly)
1034       si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
1035                                  shader->wave_size, debug, "main", NULL);
1036 
1037    pipe_debug_message(debug, SHADER_INFO,
1038                       "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1039                       "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1040                       "Spilled VGPRs: %d PrivMem VGPRs: %d DivergentLoop: %d, InlineUniforms: %d, "
1041                       "ParamExports: %u, (%s, W%u)",
1042                       conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1043                       conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1044                       conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
1045                       shader->selector->info.has_divergent_loop,
1046                       shader->selector->info.base.num_inlinable_uniforms,
1047                       shader->info.nr_param_exports,
1048                       stages[shader->selector->info.stage], shader->wave_size);
1049 }
1050 
si_shader_dump_stats(struct si_screen * sscreen,struct si_shader * shader,FILE * file,bool check_debug_option)1051 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1052                                  bool check_debug_option)
1053 {
1054    const struct ac_shader_config *conf = &shader->config;
1055 
1056    if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
1057       if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
1058          fprintf(file,
1059                  "*** SHADER CONFIG ***\n"
1060                  "SPI_PS_INPUT_ADDR = 0x%04x\n"
1061                  "SPI_PS_INPUT_ENA  = 0x%04x\n",
1062                  conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1063       }
1064 
1065       fprintf(file,
1066               "*** SHADER STATS ***\n"
1067               "SGPRS: %d\n"
1068               "VGPRS: %d\n"
1069               "Spilled SGPRs: %d\n"
1070               "Spilled VGPRs: %d\n"
1071               "Private memory VGPRs: %d\n"
1072               "Code Size: %d bytes\n"
1073               "LDS: %d blocks\n"
1074               "Scratch: %d bytes per wave\n"
1075               "Max Waves: %d\n"
1076               "********************\n\n\n",
1077               conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1078               shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1079               conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1080    }
1081 }
1082 
si_get_shader_name(const struct si_shader * shader)1083 const char *si_get_shader_name(const struct si_shader *shader)
1084 {
1085    switch (shader->selector->info.stage) {
1086    case MESA_SHADER_VERTEX:
1087       if (shader->key.ge.as_es)
1088          return "Vertex Shader as ES";
1089       else if (shader->key.ge.as_ls)
1090          return "Vertex Shader as LS";
1091       else if (shader->key.ge.as_ngg)
1092          return "Vertex Shader as ESGS";
1093       else
1094          return "Vertex Shader as VS";
1095    case MESA_SHADER_TESS_CTRL:
1096       return "Tessellation Control Shader";
1097    case MESA_SHADER_TESS_EVAL:
1098       if (shader->key.ge.as_es)
1099          return "Tessellation Evaluation Shader as ES";
1100       else if (shader->key.ge.as_ngg)
1101          return "Tessellation Evaluation Shader as ESGS";
1102       else
1103          return "Tessellation Evaluation Shader as VS";
1104    case MESA_SHADER_GEOMETRY:
1105       if (shader->is_gs_copy_shader)
1106          return "GS Copy Shader as VS";
1107       else
1108          return "Geometry Shader";
1109    case MESA_SHADER_FRAGMENT:
1110       return "Pixel Shader";
1111    case MESA_SHADER_COMPUTE:
1112       return "Compute Shader";
1113    default:
1114       return "Unknown Shader";
1115    }
1116 }
1117 
si_shader_dump(struct si_screen * sscreen,struct si_shader * shader,struct pipe_debug_callback * debug,FILE * file,bool check_debug_option)1118 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1119                     struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
1120 {
1121    gl_shader_stage stage = shader->selector->info.stage;
1122 
1123    if (!check_debug_option || si_can_dump_shader(sscreen, stage))
1124       si_dump_shader_key(shader, file);
1125 
1126    if (!check_debug_option && shader->binary.llvm_ir_string) {
1127       if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1128          fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1129          fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1130       }
1131 
1132       fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1133       fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1134    }
1135 
1136    if (!check_debug_option ||
1137        (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
1138 
1139       fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1140 
1141       if (shader->prolog)
1142          si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1143                                     "prolog", file);
1144       if (shader->previous_stage)
1145          si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1146                                     shader->wave_size, debug, "previous stage", file);
1147       si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1148                                  file);
1149 
1150       if (shader->epilog)
1151          si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1152                                     "epilog", file);
1153       fprintf(file, "\n");
1154    }
1155 
1156    si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1157 }
1158 
si_dump_shader_key_vs(const union si_shader_key * key,const struct si_vs_prolog_bits * prolog,const char * prefix,FILE * f)1159 static void si_dump_shader_key_vs(const union si_shader_key *key,
1160                                   const struct si_vs_prolog_bits *prolog, const char *prefix,
1161                                   FILE *f)
1162 {
1163    fprintf(f, "  %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
1164    fprintf(f, "  %s.instance_divisor_is_fetched = %u\n", prefix,
1165            prolog->instance_divisor_is_fetched);
1166    fprintf(f, "  %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
1167 
1168    fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1169    fprintf(f, "  mono.vs.fix_fetch = {");
1170    for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1171       union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1172       if (i)
1173          fprintf(f, ", ");
1174       if (!fix.bits)
1175          fprintf(f, "0");
1176       else
1177          fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1178                  fix.u.format);
1179    }
1180    fprintf(f, "}\n");
1181 }
1182 
si_dump_shader_key(const struct si_shader * shader,FILE * f)1183 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1184 {
1185    const union si_shader_key *key = &shader->key;
1186    gl_shader_stage stage = shader->selector->info.stage;
1187 
1188    fprintf(f, "SHADER KEY\n");
1189    fprintf(f, "  source_sha1 = {");
1190    _mesa_sha1_print(f, shader->selector->info.base.source_sha1);
1191    fprintf(f, "}\n");
1192 
1193    switch (stage) {
1194    case MESA_SHADER_VERTEX:
1195       si_dump_shader_key_vs(key, &key->ge.part.vs.prolog, "part.vs.prolog", f);
1196       fprintf(f, "  as_es = %u\n", key->ge.as_es);
1197       fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
1198       fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
1199       fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1200       break;
1201 
1202    case MESA_SHADER_TESS_CTRL:
1203       if (shader->selector->screen->info.chip_class >= GFX9) {
1204          si_dump_shader_key_vs(key, &key->ge.part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
1205       }
1206       fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode);
1207       fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
1208               key->ge.mono.u.ff_tcs_inputs_to_copy);
1209       fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1210       fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
1211       break;
1212 
1213    case MESA_SHADER_TESS_EVAL:
1214       fprintf(f, "  as_es = %u\n", key->ge.as_es);
1215       fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
1216       fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1217       break;
1218 
1219    case MESA_SHADER_GEOMETRY:
1220       if (shader->is_gs_copy_shader)
1221          break;
1222 
1223       if (shader->selector->screen->info.chip_class >= GFX9 &&
1224           key->ge.part.gs.es->info.stage == MESA_SHADER_VERTEX) {
1225          si_dump_shader_key_vs(key, &key->ge.part.gs.vs_prolog, "part.gs.vs_prolog", f);
1226       }
1227       fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
1228       fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
1229       break;
1230 
1231    case MESA_SHADER_COMPUTE:
1232       break;
1233 
1234    case MESA_SHADER_FRAGMENT:
1235       fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
1236       fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
1237       fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
1238       fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
1239               key->ps.part.prolog.force_persp_sample_interp);
1240       fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
1241               key->ps.part.prolog.force_linear_sample_interp);
1242       fprintf(f, "  prolog.force_persp_center_interp = %u\n",
1243               key->ps.part.prolog.force_persp_center_interp);
1244       fprintf(f, "  prolog.force_linear_center_interp = %u\n",
1245               key->ps.part.prolog.force_linear_center_interp);
1246       fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
1247               key->ps.part.prolog.bc_optimize_for_persp);
1248       fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
1249               key->ps.part.prolog.bc_optimize_for_linear);
1250       fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
1251               key->ps.part.prolog.samplemask_log_ps_iter);
1252       fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
1253               key->ps.part.epilog.spi_shader_col_format);
1254       fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
1255       fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
1256       fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
1257       fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
1258       fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
1259       fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
1260       fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
1261       fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
1262               key->ps.mono.interpolate_at_sample_force_center);
1263       fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
1264       fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
1265       fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
1266       break;
1267 
1268    default:
1269       assert(0);
1270    }
1271 
1272    if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1273         stage == MESA_SHADER_VERTEX) &&
1274        !key->ge.as_es && !key->ge.as_ls) {
1275       fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
1276       fprintf(f, "  opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
1277       fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
1278       fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
1279    }
1280 
1281    if (stage <= MESA_SHADER_GEOMETRY)
1282       fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1283    else
1284       fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);
1285 
1286    if (stage <= MESA_SHADER_GEOMETRY) {
1287       if (key->ge.opt.inline_uniforms) {
1288          fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1289                  key->ge.opt.inline_uniforms,
1290                  key->ge.opt.inlined_uniform_values[0],
1291                  key->ge.opt.inlined_uniform_values[1],
1292                  key->ge.opt.inlined_uniform_values[2],
1293                  key->ge.opt.inlined_uniform_values[3]);
1294       } else {
1295          fprintf(f, "  opt.inline_uniforms = 0\n");
1296       }
1297    } else {
1298       if (key->ps.opt.inline_uniforms) {
1299          fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1300                  key->ps.opt.inline_uniforms,
1301                  key->ps.opt.inlined_uniform_values[0],
1302                  key->ps.opt.inlined_uniform_values[1],
1303                  key->ps.opt.inlined_uniform_values[2],
1304                  key->ps.opt.inlined_uniform_values[3]);
1305       } else {
1306          fprintf(f, "  opt.inline_uniforms = 0\n");
1307       }
1308    }
1309 }
1310 
si_vs_needs_prolog(const struct si_shader_selector * sel,const struct si_vs_prolog_bits * prolog_key,const union si_shader_key * key,bool ngg_cull_shader,bool is_gs)1311 bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1312                         const struct si_vs_prolog_bits *prolog_key,
1313                         const union si_shader_key *key, bool ngg_cull_shader,
1314                         bool is_gs)
1315 {
1316    assert(sel->info.stage == MESA_SHADER_VERTEX);
1317 
1318    /* VGPR initialization fixup for Vega10 and Raven is always done in the
1319     * VS prolog. */
1320    return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
1321           /* The 2nd VS prolog loads input VGPRs from LDS */
1322           (key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs);
1323 }
1324 
1325 /**
1326  * Compute the VS prolog key, which contains all the information needed to
1327  * build the VS prolog function, and set shader->info bits where needed.
1328  *
1329  * \param info             Shader info of the vertex shader.
1330  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
1331  * \param has_old_  Whether the preceding shader part is the NGG cull shader.
1332  * \param prolog_key       Key of the VS prolog
1333  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
1334  * \param key              Output shader part key.
1335  */
si_get_vs_prolog_key(const struct si_shader_info * info,unsigned num_input_sgprs,bool ngg_cull_shader,const struct si_vs_prolog_bits * prolog_key,struct si_shader * shader_out,union si_shader_part_key * key)1336 void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
1337                           bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
1338                           struct si_shader *shader_out, union si_shader_part_key *key)
1339 {
1340    memset(key, 0, sizeof(*key));
1341    key->vs_prolog.states = *prolog_key;
1342    key->vs_prolog.wave32 = shader_out->wave_size == 32;
1343    key->vs_prolog.num_input_sgprs = num_input_sgprs;
1344    key->vs_prolog.num_inputs = info->num_inputs;
1345    key->vs_prolog.as_ls = shader_out->key.ge.as_ls;
1346    key->vs_prolog.as_es = shader_out->key.ge.as_es;
1347    key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg;
1348 
1349    if (shader_out->selector->info.stage != MESA_SHADER_GEOMETRY &&
1350        !ngg_cull_shader && shader_out->key.ge.opt.ngg_culling)
1351       key->vs_prolog.load_vgprs_after_culling = 1;
1352 
1353    if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
1354       key->vs_prolog.as_ls = 1;
1355       key->vs_prolog.num_merged_next_stage_vgprs = 2;
1356    } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
1357       key->vs_prolog.as_es = 1;
1358       key->vs_prolog.num_merged_next_stage_vgprs = 5;
1359    } else if (shader_out->key.ge.as_ngg) {
1360       key->vs_prolog.num_merged_next_stage_vgprs = 5;
1361    }
1362 
1363    /* Only one of these combinations can be set. as_ngg can be set with as_es. */
1364    assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
1365           (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
1366 
1367    /* Enable loading the InstanceID VGPR. */
1368    uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1369 
1370    if ((key->vs_prolog.states.instance_divisor_is_one |
1371         key->vs_prolog.states.instance_divisor_is_fetched) &
1372        input_mask)
1373       shader_out->info.uses_instanceid = true;
1374 }
1375 
si_get_nir_shader(struct si_shader_selector * sel,const union si_shader_key * key,bool * free_nir)1376 struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,
1377                                      const union si_shader_key *key,
1378                                      bool *free_nir)
1379 {
1380    nir_shader *nir;
1381    *free_nir = false;
1382 
1383    if (sel->nir) {
1384       nir = sel->nir;
1385    } else if (sel->nir_binary) {
1386       struct pipe_screen *screen = &sel->screen->b;
1387       const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1388                                                          pipe_shader_type_from_mesa(sel->info.stage));
1389 
1390       struct blob_reader blob_reader;
1391       blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1392       *free_nir = true;
1393       nir = nir_deserialize(NULL, options, &blob_reader);
1394    } else {
1395       return NULL;
1396    }
1397 
1398    bool progress = false;
1399 
1400    bool inline_uniforms = false;
1401    uint32_t *inlined_uniform_values;
1402    si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
1403                                &inline_uniforms, &inlined_uniform_values);
1404 
1405    if (inline_uniforms) {
1406       assert(*free_nir);
1407 
1408       /* Most places use shader information from the default variant, not
1409        * the optimized variant. These are the things that the driver looks at
1410        * in optimized variants and the list of things that we need to do.
1411        *
1412        * The driver takes into account these things if they suddenly disappear
1413        * from the shader code:
1414        * - Register usage and code size decrease (obvious)
1415        * - Eliminated PS system values are disabled by LLVM
1416        *   (FragCoord, FrontFace, barycentrics)
1417        * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
1418        *   (thanks to an LLVM pass in Mesa - TODO: move it to NIR)
1419        *   The storage for eliminated outputs is also not allocated.
1420        * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
1421        * - TCS output stores are eliminated
1422        *
1423        * TODO: These are things the driver ignores in the final shader code
1424        * and relies on the default shader info.
1425        * - Other system values are not eliminated
1426        * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
1427        *   to remove holes
1428        * - uses_discard - if it changed to false
1429        * - writes_memory - if it changed to false
1430        * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
1431        *   eliminated
1432        * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
1433        *   GS outputs are eliminated except for the temporary LDS.
1434        *   Clip distances, gl_PointSize, and PS outputs are eliminated based
1435        *   on current states, so we don't care about the shader code.
1436        *
1437        * TODO: Merged shaders don't inline uniforms for the first stage.
1438        * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
1439        * (key == NULL for the first stage here)
1440        *
1441        * TODO: Compute shaders don't support inlinable uniforms, because they
1442        * don't have shader variants.
1443        *
1444        * TODO: The driver uses a linear search to find a shader variant. This
1445        * can be really slow if we get too many variants due to uniform inlining.
1446        */
1447       NIR_PASS_V(nir, nir_inline_uniforms,
1448                  nir->info.num_inlinable_uniforms,
1449                  inlined_uniform_values,
1450                  nir->info.inlinable_uniform_dw_offsets);
1451       progress = true;
1452    }
1453 
1454    if (progress)
1455       si_nir_opts(sel->screen, nir, true);
1456 
1457    /* Lower large variables that are always constant with load_constant intrinsics, which
1458     * get turned into PC-relative loads from a data section next to the shader.
1459     *
1460     * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
1461     * this should be done after that.
1462     *
1463     * The pass crashes if there are dead temps of lowered IO interface types, so remove
1464     * them first.
1465     */
1466    bool progress2 = false;
1467    NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
1468    NIR_PASS(progress2, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
1469 
1470    /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
1471     * this should be done after that.
1472     */
1473    progress2 |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.chip_class);
1474    if (progress2)
1475       si_nir_opts(sel->screen, nir, false);
1476 
1477    if (progress || progress2)
1478       si_nir_late_opts(nir);
1479 
1480    /* This must be done again. */
1481    NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);
1482 
1483    /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
1484     * 200 is tuned for Viewperf. It should be done last.
1485     */
1486    NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
1487 
1488    return nir;
1489 }
1490 
si_update_shader_binary_info(struct si_shader * shader,nir_shader * nir)1491 void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
1492 {
1493    struct si_shader_info info;
1494    si_nir_scan_shader(nir, &info);
1495 
1496    shader->info.uses_vmem_load_other |= info.uses_vmem_load_other;
1497    shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
1498 }
1499 
si_compile_shader(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug)1500 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1501                        struct si_shader *shader, struct pipe_debug_callback *debug)
1502 {
1503    struct si_shader_selector *sel = shader->selector;
1504    bool free_nir;
1505    struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);
1506 
1507    /* Dump NIR before doing NIR->LLVM conversion in case the
1508     * conversion fails. */
1509    if (si_can_dump_shader(sscreen, sel->info.stage) &&
1510        !(sscreen->debug_flags & DBG(NO_NIR))) {
1511       nir_print_shader(nir, stderr);
1512       si_dump_streamout(&sel->so);
1513    }
1514 
1515    /* Initialize vs_output_ps_input_cntl to default. */
1516    for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
1517       shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
1518    shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;
1519 
1520    si_update_shader_binary_info(shader, nir);
1521 
1522    shader->info.uses_instanceid = sel->info.uses_instanceid;
1523    shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);
1524 
1525    /* TODO: ACO could compile non-monolithic shaders here (starting
1526     * with PS and NGG VS), but monolithic shaders should be compiled
1527     * by LLVM due to more complicated compilation.
1528     */
1529    if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
1530       return false;
1531 
1532    /* The GS copy shader is compiled next. */
1533    if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
1534       shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
1535       if (!shader->gs_copy_shader) {
1536          fprintf(stderr, "radeonsi: can't create GS copy shader\n");
1537          return false;
1538       }
1539    }
1540 
1541    /* Compute vs_output_ps_input_cntl. */
1542    if ((sel->info.stage == MESA_SHADER_VERTEX ||
1543         sel->info.stage == MESA_SHADER_TESS_EVAL ||
1544         sel->info.stage == MESA_SHADER_GEOMETRY) &&
1545        !shader->key.ge.as_ls && !shader->key.ge.as_es) {
1546       ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;
1547 
1548       if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
1549          vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;
1550 
1551       /* VS and TES should also set primitive ID output if it's used. */
1552       unsigned num_outputs_with_prim_id = sel->info.num_outputs +
1553                                           shader->key.ge.mono.u.vs_export_prim_id;
1554 
1555       for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
1556          unsigned semantic = sel->info.output_semantic[i];
1557          unsigned offset = vs_output_param_offset[i];
1558          unsigned ps_input_cntl;
1559 
1560          if (offset <= AC_EXP_PARAM_OFFSET_31) {
1561             /* The input is loaded from parameter memory. */
1562             ps_input_cntl = S_028644_OFFSET(offset);
1563          } else {
1564             /* The input is a DEFAULT_VAL constant. */
1565             assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
1566                    offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
1567             offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
1568 
1569             /* OFFSET=0x20 means that DEFAULT_VAL is used. */
1570             ps_input_cntl = S_028644_OFFSET(0x20) |
1571                             S_028644_DEFAULT_VAL(offset);
1572          }
1573 
1574          shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
1575       }
1576    }
1577 
1578    /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
1579    if (sel->info.stage == MESA_SHADER_COMPUTE) {
1580       unsigned max_vgprs =
1581          sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
1582       unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
1583       unsigned max_sgprs_per_wave = 128;
1584       unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
1585       unsigned threads_per_tg = si_get_max_workgroup_size(shader);
1586       unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
1587       unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
1588 
1589       max_vgprs = max_vgprs / waves_per_simd;
1590       max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
1591 
1592       if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
1593          fprintf(stderr,
1594                  "LLVM failed to compile a shader correctly: "
1595                  "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
1596                  shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
1597 
1598          /* Just terminate the process, because dependent
1599           * shaders can hang due to bad input data, but use
1600           * the env var to allow shader-db to work.
1601           */
1602          if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
1603             abort();
1604       }
1605    }
1606 
1607    /* Add the scratch offset to input SGPRs. */
1608    if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
1609       shader->info.num_input_sgprs += 1; /* scratch byte offset */
1610 
1611    /* Calculate the number of fragment input VGPRs. */
1612    if (sel->info.stage == MESA_SHADER_FRAGMENT) {
1613       shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
1614          &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index,
1615          &shader->info.sample_coverage_vgpr_index);
1616    }
1617 
1618    si_calculate_max_simd_waves(shader);
1619    si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
1620    return true;
1621 }
1622 
1623 /**
1624  * Create, compile and return a shader part (prolog or epilog).
1625  *
1626  * \param sscreen	screen
1627  * \param list		list of shader parts of the same category
1628  * \param type		shader type
1629  * \param key		shader part key
1630  * \param prolog	whether the part being requested is a prolog
1631  * \param tm		LLVM target machine
1632  * \param debug		debug callback
1633  * \param build		the callback responsible for building the main function
1634  * \return		non-NULL on success
1635  */
1636 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct pipe_debug_callback * debug,void (* build)(struct si_shader_context *,union si_shader_part_key *),const char * name)1637 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
1638                    gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
1639                    struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
1640                    void (*build)(struct si_shader_context *, union si_shader_part_key *),
1641                    const char *name)
1642 {
1643    struct si_shader_part *result;
1644 
1645    simple_mtx_lock(&sscreen->shader_parts_mutex);
1646 
1647    /* Find existing. */
1648    for (result = *list; result; result = result->next) {
1649       if (memcmp(&result->key, key, sizeof(*key)) == 0) {
1650          simple_mtx_unlock(&sscreen->shader_parts_mutex);
1651          return result;
1652       }
1653    }
1654 
1655    /* Compile a new one. */
1656    result = CALLOC_STRUCT(si_shader_part);
1657    result->key = *key;
1658 
1659    struct si_shader_selector sel = {};
1660    sel.screen = sscreen;
1661 
1662    struct si_shader shader = {};
1663    shader.selector = &sel;
1664    bool wave32 = false;
1665 
1666    switch (stage) {
1667    case MESA_SHADER_VERTEX:
1668       shader.key.ge.as_ls = key->vs_prolog.as_ls;
1669       shader.key.ge.as_es = key->vs_prolog.as_es;
1670       shader.key.ge.as_ngg = key->vs_prolog.as_ngg;
1671       wave32 = key->vs_prolog.wave32;
1672       break;
1673    case MESA_SHADER_TESS_CTRL:
1674       assert(!prolog);
1675       shader.key.ge.part.tcs.epilog = key->tcs_epilog.states;
1676       wave32 = key->tcs_epilog.wave32;
1677       break;
1678    case MESA_SHADER_FRAGMENT:
1679       if (prolog) {
1680          shader.key.ps.part.prolog = key->ps_prolog.states;
1681          wave32 = key->ps_prolog.wave32;
1682       } else {
1683          shader.key.ps.part.epilog = key->ps_epilog.states;
1684          wave32 = key->ps_epilog.wave32;
1685       }
1686       break;
1687    default:
1688       unreachable("bad shader part");
1689    }
1690 
1691    struct si_shader_context ctx;
1692    si_llvm_context_init(&ctx, sscreen, compiler, wave32 ? 32 : 64);
1693 
1694    ctx.shader = &shader;
1695    ctx.stage = stage;
1696 
1697    build(&ctx, key);
1698 
1699    /* Compile. */
1700    si_llvm_optimize_module(&ctx);
1701 
1702    if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
1703                         ctx.stage, name, false)) {
1704       FREE(result);
1705       result = NULL;
1706       goto out;
1707    }
1708 
1709    result->next = *list;
1710    *list = result;
1711 
1712 out:
1713    si_llvm_dispose(&ctx);
1714    simple_mtx_unlock(&sscreen->shader_parts_mutex);
1715    return result;
1716 }
1717 
si_get_vs_prolog(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug,struct si_shader * main_part,const struct si_vs_prolog_bits * key)1718 static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1719                              struct si_shader *shader, struct pipe_debug_callback *debug,
1720                              struct si_shader *main_part, const struct si_vs_prolog_bits *key)
1721 {
1722    struct si_shader_selector *vs = main_part->selector;
1723 
1724    if (!si_vs_needs_prolog(vs, key, &shader->key, false,
1725                            shader->selector->info.stage == MESA_SHADER_GEOMETRY))
1726       return true;
1727 
1728    /* Get the prolog. */
1729    union si_shader_part_key prolog_key;
1730    si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
1731                         &prolog_key);
1732 
1733    shader->prolog =
1734       si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
1735                          compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
1736    return shader->prolog != NULL;
1737 }
1738 
1739 /**
1740  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
1741  */
si_shader_select_vs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug)1742 static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1743                                       struct si_shader *shader, struct pipe_debug_callback *debug)
1744 {
1745    return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.ge.part.vs.prolog);
1746 }
1747 
1748 /**
1749  * Select and compile (or reuse) TCS parts (epilog).
1750  */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug)1751 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1752                                        struct si_shader *shader, struct pipe_debug_callback *debug)
1753 {
1754    if (sscreen->info.chip_class >= GFX9) {
1755       struct si_shader *ls_main_part = shader->key.ge.part.tcs.ls->main_shader_part_ls;
1756 
1757       if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
1758                             &shader->key.ge.part.tcs.ls_prolog))
1759          return false;
1760 
1761       shader->previous_stage = ls_main_part;
1762    }
1763 
1764    /* Get the epilog. */
1765    union si_shader_part_key epilog_key;
1766    memset(&epilog_key, 0, sizeof(epilog_key));
1767    epilog_key.tcs_epilog.wave32 = shader->wave_size == 32;
1768    epilog_key.tcs_epilog.states = shader->key.ge.part.tcs.epilog;
1769 
1770    shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
1771                                        &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
1772                                        "Tessellation Control Shader Epilog");
1773    return shader->epilog != NULL;
1774 }
1775 
1776 /**
1777  * Select and compile (or reuse) GS parts (prolog).
1778  */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug)1779 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1780                                       struct si_shader *shader, struct pipe_debug_callback *debug)
1781 {
1782    if (sscreen->info.chip_class >= GFX9) {
1783       struct si_shader *es_main_part;
1784 
1785       if (shader->key.ge.as_ngg)
1786          es_main_part = shader->key.ge.part.gs.es->main_shader_part_ngg_es;
1787       else
1788          es_main_part = shader->key.ge.part.gs.es->main_shader_part_es;
1789 
1790       if (shader->key.ge.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
1791           !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
1792                             &shader->key.ge.part.gs.vs_prolog))
1793          return false;
1794 
1795       shader->previous_stage = es_main_part;
1796    }
1797 
1798    return true;
1799 }
1800 
1801 /**
1802  * Compute the PS prolog key, which contains all the information needed to
1803  * build the PS prolog function, and set related bits in shader->config.
1804  */
si_get_ps_prolog_key(struct si_shader * shader,union si_shader_part_key * key,bool separate_prolog)1805 void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
1806                           bool separate_prolog)
1807 {
1808    struct si_shader_info *info = &shader->selector->info;
1809 
1810    memset(key, 0, sizeof(*key));
1811    key->ps_prolog.states = shader->key.ps.part.prolog;
1812    key->ps_prolog.wave32 = shader->wave_size == 32;
1813    key->ps_prolog.colors_read = info->colors_read;
1814    key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
1815    key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
1816    key->ps_prolog.wqm =
1817       info->base.fs.needs_quad_helper_invocations &&
1818       (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
1819        key->ps_prolog.states.force_linear_sample_interp ||
1820        key->ps_prolog.states.force_persp_center_interp ||
1821        key->ps_prolog.states.force_linear_center_interp ||
1822        key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
1823    key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
1824    key->ps_prolog.sample_coverage_vgpr_index = shader->info.sample_coverage_vgpr_index;
1825 
1826    if (shader->key.ps.part.prolog.poly_stipple)
1827       shader->info.uses_vmem_load_other = true;
1828 
1829    if (info->colors_read) {
1830       ubyte *color = shader->selector->color_attr_index;
1831 
1832       if (shader->key.ps.part.prolog.color_two_side) {
1833          /* BCOLORs are stored after the last input. */
1834          key->ps_prolog.num_interp_inputs = info->num_inputs;
1835          key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
1836          if (separate_prolog)
1837             shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
1838       }
1839 
1840       for (unsigned i = 0; i < 2; i++) {
1841          unsigned interp = info->color_interpolate[i];
1842          unsigned location = info->color_interpolate_loc[i];
1843 
1844          if (!(info->colors_read & (0xf << i * 4)))
1845             continue;
1846 
1847          key->ps_prolog.color_attr_index[i] = color[i];
1848 
1849          if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
1850             interp = INTERP_MODE_FLAT;
1851 
1852          switch (interp) {
1853          case INTERP_MODE_FLAT:
1854             key->ps_prolog.color_interp_vgpr_index[i] = -1;
1855             break;
1856          case INTERP_MODE_SMOOTH:
1857          case INTERP_MODE_COLOR:
1858             /* Force the interpolation location for colors here. */
1859             if (shader->key.ps.part.prolog.force_persp_sample_interp)
1860                location = TGSI_INTERPOLATE_LOC_SAMPLE;
1861             if (shader->key.ps.part.prolog.force_persp_center_interp)
1862                location = TGSI_INTERPOLATE_LOC_CENTER;
1863 
1864             switch (location) {
1865             case TGSI_INTERPOLATE_LOC_SAMPLE:
1866                key->ps_prolog.color_interp_vgpr_index[i] = 0;
1867                if (separate_prolog) {
1868                   shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
1869                }
1870                break;
1871             case TGSI_INTERPOLATE_LOC_CENTER:
1872                key->ps_prolog.color_interp_vgpr_index[i] = 2;
1873                if (separate_prolog) {
1874                   shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
1875                }
1876                break;
1877             case TGSI_INTERPOLATE_LOC_CENTROID:
1878                key->ps_prolog.color_interp_vgpr_index[i] = 4;
1879                if (separate_prolog) {
1880                   shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
1881                }
1882                break;
1883             default:
1884                assert(0);
1885             }
1886             break;
1887          case INTERP_MODE_NOPERSPECTIVE:
1888             /* Force the interpolation location for colors here. */
1889             if (shader->key.ps.part.prolog.force_linear_sample_interp)
1890                location = TGSI_INTERPOLATE_LOC_SAMPLE;
1891             if (shader->key.ps.part.prolog.force_linear_center_interp)
1892                location = TGSI_INTERPOLATE_LOC_CENTER;
1893 
1894             /* The VGPR assignment for non-monolithic shaders
1895              * works because InitialPSInputAddr is set on the
1896              * main shader and PERSP_PULL_MODEL is never used.
1897              */
1898             switch (location) {
1899             case TGSI_INTERPOLATE_LOC_SAMPLE:
1900                key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
1901                if (separate_prolog) {
1902                   shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
1903                }
1904                break;
1905             case TGSI_INTERPOLATE_LOC_CENTER:
1906                key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
1907                if (separate_prolog) {
1908                   shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
1909                }
1910                break;
1911             case TGSI_INTERPOLATE_LOC_CENTROID:
1912                key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
1913                if (separate_prolog) {
1914                   shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
1915                }
1916                break;
1917             default:
1918                assert(0);
1919             }
1920             break;
1921          default:
1922             assert(0);
1923          }
1924       }
1925    }
1926 }
1927 
1928 /**
1929  * Check whether a PS prolog is required based on the key.
1930  */
si_need_ps_prolog(const union si_shader_part_key * key)1931 bool si_need_ps_prolog(const union si_shader_part_key *key)
1932 {
1933    return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
1934           key->ps_prolog.states.force_linear_sample_interp ||
1935           key->ps_prolog.states.force_persp_center_interp ||
1936           key->ps_prolog.states.force_linear_center_interp ||
1937           key->ps_prolog.states.bc_optimize_for_persp ||
1938           key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
1939           key->ps_prolog.states.samplemask_log_ps_iter;
1940 }
1941 
1942 /**
1943  * Compute the PS epilog key, which contains all the information needed to
1944  * build the PS epilog function.
1945  */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)1946 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
1947 {
1948    struct si_shader_info *info = &shader->selector->info;
1949    memset(key, 0, sizeof(*key));
1950    key->ps_epilog.wave32 = shader->wave_size == 32;
1951    key->ps_epilog.colors_written = info->colors_written;
1952    key->ps_epilog.color_types = info->output_color_types;
1953    key->ps_epilog.writes_z = info->writes_z;
1954    key->ps_epilog.writes_stencil = info->writes_stencil;
1955    key->ps_epilog.writes_samplemask = info->writes_samplemask;
1956    key->ps_epilog.states = shader->key.ps.part.epilog;
1957 }
1958 
1959 /**
1960  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
1961  */
si_shader_select_ps_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug)1962 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1963                                       struct si_shader *shader, struct pipe_debug_callback *debug)
1964 {
1965    union si_shader_part_key prolog_key;
1966    union si_shader_part_key epilog_key;
1967 
1968    /* Get the prolog. */
1969    si_get_ps_prolog_key(shader, &prolog_key, true);
1970 
1971    /* The prolog is a no-op if these aren't set. */
1972    if (si_need_ps_prolog(&prolog_key)) {
1973       shader->prolog =
1974          si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
1975                             compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
1976       if (!shader->prolog)
1977          return false;
1978    }
1979 
1980    /* Get the epilog. */
1981    si_get_ps_epilog_key(shader, &epilog_key);
1982 
1983    shader->epilog =
1984       si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
1985                          compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
1986    if (!shader->epilog)
1987       return false;
1988 
1989    /* Enable POS_FIXED_PT if polygon stippling is enabled. */
1990    if (shader->key.ps.part.prolog.poly_stipple) {
1991       shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
1992       assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
1993    }
1994 
1995    /* Set up the enable bits for per-sample shading if needed. */
1996    if (shader->key.ps.part.prolog.force_persp_sample_interp &&
1997        (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
1998         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
1999       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2000       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2001       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2002    }
2003    if (shader->key.ps.part.prolog.force_linear_sample_interp &&
2004        (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2005         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2006       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2007       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2008       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2009    }
2010    if (shader->key.ps.part.prolog.force_persp_center_interp &&
2011        (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2012         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2013       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2014       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2015       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2016    }
2017    if (shader->key.ps.part.prolog.force_linear_center_interp &&
2018        (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2019         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2020       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2021       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2022       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2023    }
2024 
2025    /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
2026    if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2027        !(shader->config.spi_ps_input_ena & 0xf)) {
2028       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2029       assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
2030    }
2031 
2032    /* At least one pair of interpolation weights must be enabled. */
2033    if (!(shader->config.spi_ps_input_ena & 0x7f)) {
2034       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2035       assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
2036    }
2037 
2038    /* Samplemask fixup requires the sample ID. */
2039    if (shader->key.ps.part.prolog.samplemask_log_ps_iter) {
2040       shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2041       assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
2042    }
2043 
2044    return true;
2045 }
2046 
si_multiwave_lds_size_workaround(struct si_screen * sscreen,unsigned * lds_size)2047 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
2048 {
2049    /* If tessellation is all offchip and on-chip GS isn't used, this
2050     * workaround is not needed.
2051     */
2052    return;
2053 
2054    /* SPI barrier management bug:
2055     *   Make sure we have at least 4k of LDS in use to avoid the bug.
2056     *   It applies to workgroup sizes of more than one wavefront.
2057     */
2058    if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
2059       *lds_size = MAX2(*lds_size, 8);
2060 }
2061 
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)2062 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2063 {
2064    unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2065 
2066    shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2067 
2068    if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
2069        si_get_max_workgroup_size(shader) > shader->wave_size) {
2070       si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
2071    }
2072 }
2073 
si_create_shader_variant(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct pipe_debug_callback * debug)2074 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2075                               struct si_shader *shader, struct pipe_debug_callback *debug)
2076 {
2077    struct si_shader_selector *sel = shader->selector;
2078    struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
2079 
2080    /* LS, ES, VS are compiled on demand if the main part hasn't been
2081     * compiled for that stage.
2082     *
2083     * GS are compiled on demand if the main part hasn't been compiled
2084     * for the chosen NGG-ness.
2085     *
2086     * Vertex shaders are compiled on demand when a vertex fetch
2087     * workaround must be applied.
2088     */
2089    if (shader->is_monolithic) {
2090       /* Monolithic shader (compiled as a whole, has many variants,
2091        * may take a long time to compile).
2092        */
2093       if (!si_compile_shader(sscreen, compiler, shader, debug))
2094          return false;
2095    } else {
2096       /* The shader consists of several parts:
2097        *
2098        * - the middle part is the user shader, it has 1 variant only
2099        *   and it was compiled during the creation of the shader
2100        *   selector
2101        * - the prolog part is inserted at the beginning
2102        * - the epilog part is inserted at the end
2103        *
2104        * The prolog and epilog have many (but simple) variants.
2105        *
2106        * Starting with gfx9, geometry and tessellation control
2107        * shaders also contain the prolog and user shader parts of
2108        * the previous shader stage.
2109        */
2110 
2111       if (!mainp)
2112          return false;
2113 
2114       /* Copy the compiled shader data over. */
2115       shader->is_binary_shared = true;
2116       shader->binary = mainp->binary;
2117       shader->config = mainp->config;
2118       shader->info = mainp->info;
2119 
2120       /* Select prologs and/or epilogs. */
2121       switch (sel->info.stage) {
2122       case MESA_SHADER_VERTEX:
2123          if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
2124             return false;
2125          break;
2126       case MESA_SHADER_TESS_CTRL:
2127          if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
2128             return false;
2129          break;
2130       case MESA_SHADER_TESS_EVAL:
2131          break;
2132       case MESA_SHADER_GEOMETRY:
2133          if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
2134             return false;
2135 
2136          /* Clone the GS copy shader for the shader variant.
2137           * We can't just copy the pointer because we change the pm4 state and
2138           * si_shader_selector::gs_copy_shader must be immutable because it's shared
2139           * by multiple contexts.
2140           */
2141          if (!shader->key.ge.as_ngg) {
2142             assert(sel->main_shader_part == mainp);
2143             assert(sel->main_shader_part->gs_copy_shader);
2144             assert(sel->main_shader_part->gs_copy_shader->bo);
2145             assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
2146             assert(!sel->main_shader_part->gs_copy_shader->scratch_bo);
2147 
2148             shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
2149             memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
2150                    sizeof(*shader->gs_copy_shader));
2151             /* Increase the reference count. */
2152             pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
2153             /* Initialize some fields differently. */
2154             shader->gs_copy_shader->shader_log = NULL;
2155             shader->gs_copy_shader->is_binary_shared = true;
2156             util_queue_fence_init(&shader->gs_copy_shader->ready);
2157          }
2158          break;
2159       case MESA_SHADER_FRAGMENT:
2160          if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
2161             return false;
2162 
2163          /* Make sure we have at least as many VGPRs as there
2164           * are allocated inputs.
2165           */
2166          shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
2167          break;
2168       default:;
2169       }
2170 
2171       assert(shader->wave_size == mainp->wave_size);
2172       assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
2173 
2174       /* Update SGPR and VGPR counts. */
2175       if (shader->prolog) {
2176          shader->config.num_sgprs =
2177             MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
2178          shader->config.num_vgprs =
2179             MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
2180       }
2181       if (shader->previous_stage) {
2182          shader->config.num_sgprs =
2183             MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
2184          shader->config.num_vgprs =
2185             MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
2186          shader->config.spilled_sgprs =
2187             MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
2188          shader->config.spilled_vgprs =
2189             MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
2190          shader->info.private_mem_vgprs =
2191             MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
2192          shader->config.scratch_bytes_per_wave =
2193             MAX2(shader->config.scratch_bytes_per_wave,
2194                  shader->previous_stage->config.scratch_bytes_per_wave);
2195          shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
2196          shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
2197          shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
2198       }
2199       if (shader->epilog) {
2200          shader->config.num_sgprs =
2201             MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
2202          shader->config.num_vgprs =
2203             MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
2204       }
2205       si_calculate_max_simd_waves(shader);
2206    }
2207 
2208    if (sel->info.stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
2209       assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
2210       if (!gfx10_ngg_calculate_subgroup_info(shader)) {
2211          fprintf(stderr, "Failed to compute subgroup info\n");
2212          return false;
2213       }
2214    } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {
2215       gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
2216    }
2217 
2218    shader->uses_vs_state_provoking_vertex =
2219       sscreen->use_ngg &&
2220       /* Used to convert triangle strips from GS to triangles. */
2221       ((sel->info.stage == MESA_SHADER_GEOMETRY &&
2222         util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
2223        (sel->info.stage == MESA_SHADER_VERTEX &&
2224         /* Used to export PrimitiveID from the correct vertex. */
2225         shader->key.ge.mono.u.vs_export_prim_id));
2226 
2227    shader->uses_vs_state_outprim = sscreen->use_ngg &&
2228                                    /* Only used by streamout in vertex shaders. */
2229                                    sel->info.stage == MESA_SHADER_VERTEX &&
2230                                    sel->so.num_outputs;
2231 
2232    if (sel->info.stage == MESA_SHADER_VERTEX) {
2233       shader->uses_base_instance = sel->info.uses_base_instance ||
2234                                    shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
2235                                    shader->key.ge.part.vs.prolog.instance_divisor_is_fetched;
2236    } else if (sel->info.stage == MESA_SHADER_TESS_CTRL) {
2237       shader->uses_base_instance = shader->previous_stage_sel &&
2238                                    (shader->previous_stage_sel->info.uses_base_instance ||
2239                                     shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_one ||
2240                                     shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_fetched);
2241    } else if (sel->info.stage == MESA_SHADER_GEOMETRY) {
2242       shader->uses_base_instance = shader->previous_stage_sel &&
2243                                    (shader->previous_stage_sel->info.uses_base_instance ||
2244                                     shader->key.ge.part.gs.vs_prolog.instance_divisor_is_one ||
2245                                     shader->key.ge.part.gs.vs_prolog.instance_divisor_is_fetched);
2246    }
2247 
2248    si_fix_resource_usage(sscreen, shader);
2249    si_shader_dump(sscreen, shader, debug, stderr, true);
2250 
2251    /* Upload. */
2252    if (!si_shader_binary_upload(sscreen, shader, 0)) {
2253       fprintf(stderr, "LLVM failed to upload shader\n");
2254       return false;
2255    }
2256 
2257    return true;
2258 }
2259 
si_shader_binary_clean(struct si_shader_binary * binary)2260 void si_shader_binary_clean(struct si_shader_binary *binary)
2261 {
2262    free((void *)binary->elf_buffer);
2263    binary->elf_buffer = NULL;
2264 
2265    free(binary->llvm_ir_string);
2266    binary->llvm_ir_string = NULL;
2267 
2268    free(binary->uploaded_code);
2269    binary->uploaded_code = NULL;
2270    binary->uploaded_code_size = 0;
2271 }
2272 
si_shader_destroy(struct si_shader * shader)2273 void si_shader_destroy(struct si_shader *shader)
2274 {
2275    if (shader->scratch_bo)
2276       si_resource_reference(&shader->scratch_bo, NULL);
2277 
2278    si_resource_reference(&shader->bo, NULL);
2279 
2280    if (!shader->is_binary_shared)
2281       si_shader_binary_clean(&shader->binary);
2282 
2283    free(shader->shader_log);
2284 }
2285